[med-svn] [Git][med-team/kallisto][master] 8 commits: New upstream version 0.48.0+dfsg

Wed Jan 19 06:45:37 GMT 2022


Andrius Merkys pushed to branch master at Debian Med / kallisto


Commits:
abea4d08 by Andrius Merkys at 2022-01-19T01:08:32-05:00
New upstream version 0.48.0+dfsg
- - - - -
decabdf9 by Andrius Merkys at 2022-01-19T01:08:38-05:00
Update upstream source from tag 'upstream/0.48.0+dfsg'

Update to upstream version '0.48.0+dfsg'
with Debian dir 4179b2017828b4236a6529799a5037ecde78191f
- - - - -
fb469380 by Andrius Merkys at 2022-01-19T01:15:00-05:00
Enabling function testing.

- - - - -
d15ce768 by Andrius Merkys at 2022-01-19T01:15:28-05:00
Build with tests depend on catch2.

- - - - -
0ac5f529 by Andrius Merkys at 2022-01-19T01:16:00-05:00
Bumping copyright years.

- - - - -
08b06206 by Andrius Merkys at 2022-01-19T01:18:48-05:00
Refreshing patches, disabling Check_busOptions.seq_length_before_accessing.patch (integrated upstream).

- - - - -
651cb73e by Andrius Merkys at 2022-01-19T01:19:47-05:00
New changelog entry.

- - - - -
1e5743ab by Andrius Merkys at 2022-01-19T01:38:54-05:00
Disabling tests with bustools.

- - - - -


29 changed files:

- + .github/workflows/test.yml
- CMakeLists.txt
- debian/changelog
- debian/control
- debian/copyright
- + debian/patches/disable-failing-bustools-test.patch
- debian/patches/htslib1.3
- debian/patches/series
- debian/patches/spelling
- debian/patches/use_debian_packaged_htslib.patch
- debian/rules
- + func_tests/CMakeLists.txt
- + func_tests/runtests.sh
- src/BUSData.cpp
- src/BUSData.h
- src/BUSTools.cpp
- src/BUSTools.h
- src/GeneModel.cpp
- src/GeneModel.h
- src/Inspect.h
- src/KmerIndex.cpp
- src/KmerIndex.h
- src/MinCollector.cpp
- src/PlaintextWriter.cpp
- src/PlaintextWriter.h
- src/ProcessReads.cpp
- src/ProcessReads.h
- src/common.h
- src/main.cpp


Changes:

=====================================
.github/workflows/test.yml
=====================================
@@ -0,0 +1,28 @@
+name: Test
+
+on:
+  workflow_dispatch:
+
+jobs:
+  build-linux:
+    name: Build and test linux
+    runs-on: ubuntu-18.04
+    env:
+      RELEASE_OS: linux
+    steps:
+      - name: checkout branch
+        uses: actions/checkout at master
+      - name: Setup environment
+        id: setup
+        run: echo ::set-output name=RELEASE_VERSION::${GITHUB_REF##*/}
+      - name: make
+        run: |
+          cd ext/htslib && autoheader && autoconf && cd ../..
+          mkdir build && cd build && cmake .. -DBUILD_FUNCTESTING=ON && make && make install && cd ..
+      - name: Setup bustools
+        run: |
+          git clone https://github.com/BUStools/bustools.git
+          cd bustools && mkdir build && cd build && cmake .. && make && make install && cd ../..
+      - name: Functional testing
+        run: |
+          cd build && make test


=====================================
CMakeLists.txt
=====================================
@@ -74,5 +74,16 @@ if (BUILD_TESTING)
     add_subdirectory(unit_tests)
 endif(BUILD_TESTING)
 
+option(BUILD_FUNCTESTING "Build functional tests." OFF)
+
+if (BUILD_FUNCTESTING)
+    add_subdirectory(func_tests)
+    message("Functional testing enabled.")
+    add_custom_target(test
+    COMMAND /bin/bash ./func_tests/runtests.sh
+    DEPENDS ./src/kallisto
+    )
+endif(BUILD_FUNCTESTING)
+
 # enable_testing()
 # add_test(MainTest test/tests)


=====================================
debian/changelog
=====================================
@@ -1,3 +1,15 @@
+kallisto (0.48.0+dfsg-1) UNRELEASED; urgency=medium
+
+  * Team upload.
+  * New upstream version 0.48.0+dfsg
+  * Enabling function testing.
+  * Build with tests depend on catch2.
+  * Bumping copyright years.
+  * Refreshing patches, disabling
+    Check_busOptions.seq_length_before_accessing.patch (integrated upstream).
+
+ -- Andrius Merkys <merkys at debian.org>  Wed, 19 Jan 2022 01:19:18 -0500
+
 kallisto (0.46.2+dfsg-3) unstable; urgency=medium
 
   * Team Upload.


=====================================
debian/control
=====================================
@@ -4,6 +4,7 @@ Uploaders: Andreas Tille <tille at debian.org>
 Section: science
 Priority: optional
 Build-Depends: debhelper-compat (= 12),
+               catch2,
                cmake,
                libhdf5-dev,
                libhts-dev


=====================================
debian/copyright
=====================================
@@ -34,6 +34,7 @@ License:  public_domain.
 
 Files: debian/*
 Copyright: 2019 Andreas Tille <tille at debian.org>
+ 2022 Andrius Merkys <merkys at debian.org>
 License: BSD-2-Clause
 
 License: BSD-2-Clause


=====================================
debian/patches/disable-failing-bustools-test.patch
=====================================
@@ -0,0 +1,16 @@
+Description: Tests fail with bustools with:
+ terminate called after throwing an instance of 'std::bad_alloc'
+   what():  std::bad_alloc
+ Aborted
+Author: Andrius Merkys <merkys at debian.org>
+--- a/func_tests/runtests.sh
++++ b/func_tests/runtests.sh
+@@ -257,6 +257,8 @@
+ cmdexec "$kallisto quant -o $test_dir/quantlarge -t 12 -i $test_dir/basic7.idx --single -l 5 -s 2 $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz"
+ checkcmdoutput "cat $test_dir/quantlarge/abundance.tsv" 024e56c50c774aa09e00a441512415aa
+ 
++echo "Not testing with bustools as tests fail with bustools"
++exit 0
+ 
+ ### TEST - kallisto bus ###
+ 


=====================================
debian/patches/htslib1.3
=====================================
@@ -1,8 +1,8 @@
 From: Michael R. Crusoe <michael.crusoe at gmail.com>
 Subject: Use kseq.h from htslib, not the bundled version
---- kallisto.orig/src/KmerIndex.cpp
-+++ kallisto/src/KmerIndex.cpp
-@@ -4,7 +4,7 @@
+--- a/src/KmerIndex.cpp
++++ b/src/KmerIndex.cpp
+@@ -5,7 +5,7 @@
  #include <ctype.h>
  #include <zlib.h>
  #include <unordered_set>
@@ -11,8 +11,8 @@ Subject: Use kseq.h from htslib, not the bundled version
  
  #ifndef KSEQ_INIT_READY
  #define KSEQ_INIT_READY
---- kallisto.orig/src/ProcessReads.cpp
-+++ kallisto/src/ProcessReads.cpp
+--- a/src/ProcessReads.cpp
++++ b/src/ProcessReads.cpp
 @@ -1,6 +1,6 @@
  /*
  #include <zlib.h>
@@ -30,8 +30,8 @@ Subject: Use kseq.h from htslib, not the bundled version
  #include "PseudoBam.h"
  #include "Fusion.hpp"
  #include "BUSData.h"
---- kallisto.orig/src/ProcessReads.h
-+++ kallisto/src/ProcessReads.h
+--- a/src/ProcessReads.h
++++ b/src/ProcessReads.h
 @@ -2,7 +2,7 @@
  #define KALLISTO_PROCESSREADS_H
  
@@ -41,8 +41,8 @@ Subject: Use kseq.h from htslib, not the bundled version
  #include <string>
  #include <vector>
  #include <unordered_map>
---- kallisto.orig/unit_tests/test_kmerhashtable.cpp
-+++ kallisto/unit_tests/test_kmerhashtable.cpp
+--- a/unit_tests/test_kmerhashtable.cpp
++++ b/unit_tests/test_kmerhashtable.cpp
 @@ -13,7 +13,7 @@
  #include "KmerHashTable.h"
  


=====================================
debian/patches/series
=====================================
@@ -1,5 +1,6 @@
 use_debian_packaged_htslib.patch
 spelling
 htslib1.3
-Check_busOptions.seq_length_before_accessing.patch
+#Check_busOptions.seq_length_before_accessing.patch
 gcc11.patch
+disable-failing-bustools-test.patch


=====================================
debian/patches/spelling
=====================================
@@ -1,8 +1,8 @@
 From: Michael R. Crusoe
 Subject: fix spelling typo
---- kallisto.orig/src/Inspect.h
-+++ kallisto/src/Inspect.h
-@@ -249,7 +249,7 @@
+--- a/src/Inspect.h
++++ b/src/Inspect.h
+@@ -250,7 +250,7 @@
        const Contig& c = index.dbGraph.contigs[i];
  
        if (c.seq.size() != c.length + k-1) {


=====================================
debian/patches/use_debian_packaged_htslib.patch
=====================================
@@ -4,7 +4,7 @@ Description: Adapt build system to Debian packaged libhts
 
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -34,7 +34,7 @@ ELSE(LINK MATCHES shared)
+@@ -41,7 +41,7 @@
      message("shared build")
  ENDIF(LINK MATCHES static)
  
@@ -13,7 +13,7 @@ Description: Adapt build system to Debian packaged libhts
  include(ExternalProject)
  ExternalProject_Add(htslib
      PREFIX ${PROJECT_SOURCE_DIR}/ext/htslib
-@@ -47,7 +47,7 @@ ExternalProject_Add(htslib
+@@ -54,7 +54,7 @@
  )
  
  include_directories(${htslib_PREFIX}/src/htslib)
@@ -24,7 +24,7 @@ Description: Adapt build system to Debian packaged libhts
  # add_compile_options(-Wdeprecated-register)
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
-@@ -3,7 +3,9 @@ file(GLOB headers *.h *.hpp)
+@@ -3,7 +3,9 @@
  
  list(REMOVE_ITEM sources main.cpp)
  
@@ -34,7 +34,7 @@ Description: Adapt build system to Debian packaged libhts
  
  add_library(kallisto_core ${sources} ${headers})
  target_include_directories(kallisto_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-@@ -11,7 +13,11 @@ target_include_directories(kallisto_core
+@@ -11,7 +13,11 @@
  add_executable(kallisto main.cpp)
  
  find_package( Threads REQUIRED )
@@ -46,7 +46,7 @@ Description: Adapt build system to Debian packaged libhts
  
  if(LINK MATCHES static)
      set(BUILD_SHARED_LIBS OFF)
-@@ -56,4 +62,4 @@ else()
+@@ -62,4 +68,4 @@
  endif(LINK MATCHES static)
  
  


=====================================
debian/rules
=====================================
@@ -5,19 +5,11 @@
 export DEB_BUILD_MAINT_OPTIONS=hardening=+all
 
 CMAKE_EXTRA_FLAGS += -DDEBIAN_BUILD=1 \
-                     -DLINK=shared
-
-# Build time test requires https://github.com/philsquared/Catch
-#                     -DBUILD_TESTING=ON \
+                     -DLINK=shared \
+                     -DBUILD_FUNCTESTING=1
 
 %:
 	dh $@
 
 override_dh_auto_configure:
 	dh_auto_configure -- $(CMAKE_EXTRA_FLAGS)
-
-### When overriding auto_test make sure DEB_BUILD_OPTIONS will be respected
-#override_dh_auto_test:
-#ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
-#	do_stuff_for_testing
-#endif


=====================================
func_tests/CMakeLists.txt
=====================================
@@ -0,0 +1,4 @@
+project(FuncTests)
+
+file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/runtests.sh
+     DESTINATION ${CMAKE_CURRENT_BINARY_DIR})


=====================================
func_tests/runtests.sh
=====================================
@@ -0,0 +1,296 @@
+#!/bin/bash
+
+### Setup ###
+
+kallisto="./src/kallisto"
+bustools="bustools"
+test_dir="./func_tests"
+
+cmdexec() {
+	cmd="$1"
+	expect_fail="$2"
+	printf "$cmd\n"
+	if eval "$cmd 2> /dev/null 1> /dev/null"; then
+		if [ "$expect_fail" = "1" ]; then
+			printf "^[Failed - Returned normally but expected failure]\n"
+			exit 1
+		else
+			printf "^[OK]\n"
+		fi
+	else
+		if [ "$expect_fail" = "1" ]; then
+			printf "^[OK - Exited non-zero as expected]\n"
+		else
+			printf "^[Failed]\n"
+			exit 1
+		fi
+	fi
+}
+
+checkcmdoutput() {
+	cmd="$1"
+	correct_md5="$2"
+	printf "$cmd\n"
+	output_md5=$(eval "$cmd"|md5sum|awk '{ print $1 }')
+	if [ "$output_md5" = "$correct_md5" ]; then
+		printf "^[Output OK]\n"
+	else
+		printf "^[Output incorrect! Expected: "
+		printf "$correct_md5"
+		printf " Actual: "
+		printf "$output_md5"
+		printf "]\n"
+		exit 1
+	fi	
+}
+
+### FASTA files ###
+
+echo ">t1
+ACGTGATGAGTGAGTCAGT
+>t2
+ACGTGATGAGATGATGAGTCAGT
+>t3
+ACGTGATGTGAGTCAGT
+>t4
+ccccaaaaaa
+>t5
+ttttttgggg" > $test_dir/simple.fasta
+
+echo ">t1
+AAANNNTTTKK
+>t2
+NNNCCCCNCCC" > $test_dir/nonATCG.fasta
+
+echo ">t1
+AAAAAAAAAAAAAAAA
+>t2
+CCCAAAAAAAAAAAAA
+>t3
+CCCGGAAAAAAAAAAA
+>t4
+AAAAAAAAAAAAATTT
+>t5
+TAAAAAAAAAAAAAAT" > $test_dir/polyA.fasta
+
+echo ">t1
+AAATTTCCCGGGAAATTTCCCGGG
+>t2
+AAAtttCCCgggAAAtttCCCGGG
+>t3
+AAATTTCCCGGGAAATTTCCCGGG
+>t4
+AAATTTCCCGGGAAATTTCCCGGG
+>t4
+CCCCCCCCCCCCCCCCCCCCCCCC
+>t5
+CCCCCCCCCCCCCCCCCCCCCCCC
+>t5
+CCCCCCCCCCCCCCCCCCCCCCCC" > $test_dir/duplicates.fasta
+
+### FASTQ files ###
+
+cat $test_dir/simple.fasta $test_dir/nonATCG.fasta $test_dir/polyA.fasta $test_dir/duplicates.fasta | \
+	awk '{if(NR%2){ print "\@" $0 } else { printf "%s\n+\n",$0; i=0; while (i++ < length($0)) printf "J"; printf "\n" } }' \
+	| gzip > $test_dir/small.fastq.gz
+
+
+cat /dev/null > $test_dir/medium.fastq.gz
+for i in {1..700}; do cat $test_dir/small.fastq.gz >> $test_dir/medium.fastq.gz; done
+
+cat /dev/null > $test_dir/large.fastq.gz
+for i in {1..100}; do cat $test_dir/medium.fastq.gz >> $test_dir/large.fastq.gz; done
+
+rm $test_dir/medium.fastq.gz
+
+# Paired-end reads for basic7.idx index
+
+echo "@t1
+ACGTGATG
++
+JJJJJJJJ
+ at t2
+ACGTGATG
++
+JJJJJJJJ
+ at t3
+CCCCCCCC
++
+JJJJJJJJ
+ at t4
+ccccaaaaaa
++
+JJJJJJJJJJ
+ at t5
+ttttttgggg
++
+JJJJJJJJJJ
+" | gzip > $test_dir/simple_pair1.fastq.gz
+
+echo "@t1
+GAGTCAGT
++
+JJJJJJJJ
+ at t2
+TGAGTCAG
++
+JJJJJJJJ
+ at t3
+ACGTGATG
++
+JJJJJJJJ
+ at t4
+ccccaaaaaa
++
+JJJJJJJJJJ
+ at t5
+ccccaaaaaa
++
+JJJJJJJJJJ
+" | gzip > $test_dir/simple_pair2.fastq.gz
+
+# Barcode+UMI FastQ file for 10Xv3
+
+echo "@t1
+GCCCCCCCCCCCCCCGTAAAAAAAAAATGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+ at t2
+GCCCCCCCCCCCCCCGTAAAAAAAAAATGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+ at t3
+GCCCCCCCCCCCCCCGTAAAAttAAAAtGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+ at t4
+GCCCCCCggCCCCCCGtAAAAttAAAAtGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+ at t5
+GCCCCCCttCCCCCCGtAAAAggAAAAtGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+" | gzip > $test_dir/10xv3.fastq.gz
+
+
+### TEST - kallisto index ###
+
+echo "[INDEX]"
+
+# Test k=7
+
+cmdexec "$kallisto index -i $test_dir/basic7.idx -k 7 $test_dir/simple.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/basic7.idx 2>&1|head -4" 356d498c030cb270a124ca06ad42a9d5
+
+# Test k=9
+
+cmdexec "$kallisto index -i $test_dir/basic9.idx -k 9 $test_dir/simple.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/basic9.idx 2>&1|head -4" 9f9246d5073d8e6ba90b8644e4ed9cc9
+
+# Test non-ATCG bases
+
+cmdexec "$kallisto index -i $test_dir/nonATCG.idx -k 5 $test_dir/nonATCG.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/nonATCG.idx 2>&1|head -4" 5fcab956573ca37c2220d154e7760db2
+
+# Test polyA truncation
+
+cmdexec "$kallisto index -i $test_dir/polyA.idx -k 5 $test_dir/polyA.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/polyA.idx 2>&1|head -4" 8fe4f599c55a62f87e56067ccdbf865e
+
+# Test k = even number (should fail)
+
+cmdexec "$kallisto index -i $test_dir/basic_fail.idx -k 10 $test_dir/simple.fasta" 1
+
+# Test duplicate transcript names (should fail)
+
+cmdexec "$kallisto index -i $test_dir/duplicates_fail.idx -k 11 $test_dir/duplicates.fasta" 1
+
+# Test duplicate transcript names with --make-unique
+
+cmdexec "$kallisto index -i $test_dir/duplicates.idx -k 11 --make-unique $test_dir/duplicates.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/duplicates.idx 2>&1|head -4" b57f37128e2f2e66775aa811ed994cec
+
+
+### TEST - kallisto quant ###
+
+# Test single-end small.fastq.gz using various indices
+
+cmdexec "$kallisto quant -o $test_dir/quantbasic -i $test_dir/basic7.idx --single -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasic/abundance.tsv" f1c8927dc29b943758242902d5c45a86
+
+cmdexec "$kallisto quant -o $test_dir/quantnonATCG -i $test_dir/nonATCG.idx --single -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantnonATCG/abundance.tsv" 9272185ff011f9a840c29c5c40a2d390
+
+cmdexec "$kallisto quant -o $test_dir/quantpolyA -i $test_dir/polyA.idx --single -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantpolyA/abundance.tsv" 745539c18d4ff07bf6de0938563f2362
+
+cmdexec "$kallisto quant -o $test_dir/quantduplicates -i $test_dir/duplicates.idx --single -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantduplicates/abundance.tsv" 49a62b913a155598dd6894a2f0eb0905
+
+# Test single-end small.fastq.gz strand-specificity
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicfr -i $test_dir/basic7.idx --single --fr-stranded -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicfr/abundance.tsv" ce2ed5a3a1bab582fcb62dc02f4d9323
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicrf -i $test_dir/basic7.idx --single --rf-stranded -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicrf/abundance.tsv" 017e8ba77d7e7b39a60bb7c047e620dc
+
+# Test paired-end small.fastq.gz
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicpaired -i $test_dir/basic7.idx $test_dir/simple_pair1.fastq.gz $test_dir/simple_pair2.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicpaired/abundance.tsv" 1cf9cd508c04eff7cbda6e74cc1e44ea
+
+# Test paired-end small.fastq.gz with multiple pairs of files and with strand-specificity
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicpairedfr -i $test_dir/basic7.idx --fr-stranded $test_dir/simple_pair1.fastq.gz $test_dir/simple_pair2.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicpairedfr/abundance.tsv" bc6a75291c2eb661a57b55de534dd8f0
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicpairedrf -i $test_dir/basic7.idx --rf-stranded $test_dir/simple_pair1.fastq.gz $test_dir/simple_pair2.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicpairedrf/abundance.tsv" dd1ed6ed8fb393616817a3dd506f821f
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicpairedmultfr -i $test_dir/basic7.idx --fr-stranded $test_dir/simple_pair1.fastq.gz $test_dir/simple_pair2.fastq.gz $test_dir/simple_pair2.fastq.gz $test_dir/simple_pair1.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicpairedmultfr/abundance.tsv" f810d28aed3969514f1d21120a6fc825
+
+# Test multiple large fastq files with more threads
+
+cmdexec "$kallisto quant -o $test_dir/quantlarge -t 12 -i $test_dir/basic7.idx --single -l 5 -s 2 $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz"
+checkcmdoutput "cat $test_dir/quantlarge/abundance.tsv" 024e56c50c774aa09e00a441512415aa
+
+
+### TEST - kallisto bus ###
+
+if ! command -v bustools &> /dev/null
+then
+    echo "Error: bustools could not be found"
+    exit 1
+fi
+
+# Try custom "magic string" for -x with multiple large files and many threads
+
+cmdexec "$kallisto bus -o $test_dir/buslarge -t 12 -i $test_dir/basic7.idx -x 0,0,2:0,2,6:1,0,0 $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz"
+cmdexec "$bustools sort -o $test_dir/buslarge/output.s.bus -t 12 $test_dir/buslarge/output.bus"
+checkcmdoutput "bustools text -p $test_dir/buslarge/output.s.bus" f3db94e1c983a1d51b8f8525bb1a5beb
+
+# Try a simple 10XV3 (with unstranded pseudoalignment)
+
+cmdexec "$kallisto bus -o $test_dir/bus10xv3 -t 1 -i $test_dir/basic7.idx -x 10XV3 --unstranded $test_dir/10xv3.fastq.gz $test_dir/small.fastq.gz"
+cmdexec "$bustools sort -o $test_dir/bus10xv3/output.s.bus -t 12 $test_dir/bus10xv3/output.bus"
+checkcmdoutput "$bustools text -p $test_dir/bus10xv3/output.s.bus" f8f76f64d3ef351057f3bae016e2fa8a
+
+
+# Try processing demultiplexed bulk RNA-seq with strand-specificity with EM and kallisto quant-tcc (and compare with quant) 
+
+echo "t1	g1
+t2	g1
+t3	g2
+t4	g2
+t5	g3" > $test_dir/t2g_test.txt # Make a transcript-to-gene mapping file
+cmdexec "$kallisto bus --fr-stranded -o $test_dir/busbulklarge -t 12 -i $test_dir/basic7.idx $test_dir/large.fastq.gz"
+cmdexec "$bustools sort -o $test_dir/busbulklarge/output.s.bus -t 12 $test_dir/busbulklarge/output.bus"
+cmdexec "$bustools count --cm -m -o $test_dir/busbulklarge/counts_tcc/ -g $test_dir/t2g_test.txt -t $test_dir/busbulklarge/transcripts.txt -e $test_dir/busbulklarge/matrix.ec $test_dir/busbulklarge/output.bus"
+cmdexec "$kallisto quant-tcc -o $test_dir/quant_tcc_test1/ -l 5 -s 2 -i $test_dir/busbulklarge/index.saved -g $test_dir/t2g_test.txt -e $test_dir/busbulklarge/counts_tcc/output.ec.txt $test_dir/busbulklarge/counts_tcc/output.mtx"
+checkcmdoutput "cat $test_dir/quant_tcc_test1/matrix.abundance.gene.tpm.mtx" ed7c7fa08e283cc6a1b136cc0e1ab039
+
+
+


=====================================
src/BUSData.cpp
=====================================
@@ -50,4 +50,17 @@ std::string binaryToString(uint64_t x, size_t len) {
     s.at(i) = c;
   }
   return std::move(s);
-}
\ No newline at end of file
+}
+
+int hamming(uint64_t a, uint64_t b, size_t len) {
+  uint64_t df = a^b;
+  int d = 0;
+  size_t sh = len-1;
+  for (size_t i = 0; i < len; ++i) {
+    if (((df>>(2*sh)) & 0x03ULL) != 0) {
+      d++;
+    }    
+    sh--;
+  }
+  return d;
+}


=====================================
src/BUSData.h
=====================================
@@ -38,4 +38,5 @@ struct BUSData {
 uint64_t stringToBinary(const std::string &s, uint32_t &flag);
 uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag);
 std::string binaryToString(uint64_t x, size_t len);
-#endif // KALLISTO_BUSDATA_H
\ No newline at end of file
+int hamming(uint64_t a, uint64_t b, size_t len);
+#endif // KALLISTO_BUSDATA_H


=====================================
src/BUSTools.cpp
=====================================
@@ -1,6 +1,6 @@
 #include <vector>
 #include <fstream>
-#include "BUSData.h"
+#include "BUSTools.h"
 
 void writeBUSHeader(std::ofstream &out, int bclen, int umilen) {
   out.write("BUS\0", 4);
@@ -19,4 +19,28 @@ void writeBUSData(std::ofstream &out, const std::vector<BUSData> &bv) {
       out.write((char*)(&b), sizeof(b));
     }
   }
+}
+
+void writeBUSMatrix(const std::string &filename,
+                    std::vector<std::vector<std::pair<int,int>>> &data, int cols) {
+  std::ofstream of;
+  of.open(filename.c_str(), std::ios::out | std::ios::binary);
+  writeBUSHeader(of, BUSFORMAT_FAKE_BARCODE_LEN, 1);
+  if (!data.empty()) { // reduntant  
+    for (size_t j = 0; j < data.size(); j++) {
+      const auto &v = data[j];
+      for (size_t i = 0; i < v.size(); i++) {
+        if (v[i].second != 0 && v[i].first != -1) {
+          BUSData b;
+          b.barcode = j;
+          b.flags = 0;
+          b.UMI = -1;
+          b.ec = v[i].first;
+          b.count = v[i].second;
+          of.write((char*)(&b), sizeof(b));
+        }
+      }
+    }
+  }
+  of.close();
 }
\ No newline at end of file


=====================================
src/BUSTools.h
=====================================
@@ -5,7 +5,11 @@
 #include <vector>
 #include "BUSData.h"
 
+const uint32_t BUSFORMAT_FAKE_BARCODE_LEN = 16;
+
 void writeBUSData(std::ofstream &out, const std::vector<BUSData> &bv);
 void writeBUSHeader(std::ofstream &out, int bclen, int umilen);
+void writeBUSMatrix(const std::string &filename,
+                    std::vector<std::vector<std::pair<int,int>>> &data, int cols);
 
 #endif // KALLISTO_BUSTOOLS_H
\ No newline at end of file


=====================================
src/GeneModel.cpp
=====================================
@@ -581,3 +581,57 @@ void Transcriptome::parseGTF(const std::string &gtf_fn, const KmerIndex& index,
     std::cerr << "Warning: " << num_trans_missing << " transcripts were defined in GTF file, but not in the index" << std::endl;
   }
 }
+
+void Transcriptome::parseGeneMap(const std::string &genemap_fn, const KmerIndex& index, const ProgramOptions& options) {
+  for (int i = 0; i < index.num_trans; i++) {
+    TranscriptModel tr;
+    tr.id = i;
+    tr.chr = -1;
+    tr.gene_id = -1;
+    tr.strand = true;
+    transcripts.push_back(std::move(tr));
+    trNameToId.insert({index.target_names_[i], i});
+  }
+  
+  std::ifstream ingenemap((genemap_fn));
+  if (ingenemap.is_open()) {
+    std::string line;
+    while (getline(ingenemap, line)) {
+      if (line.empty()) {
+        continue;
+      }
+      std::stringstream ss(line);
+      std::string txp, gene_name, gene_common_name;
+      ss >> txp >> gene_name >> gene_common_name;
+      if (gene_name.empty()) {
+        std::cerr << "Error: No gene associated with transcript " << txp << " in " << genemap_fn << std::endl;
+        exit(1);
+      }
+      auto it = trNameToId.find(txp);
+      if (it != trNameToId.end()) {
+        TranscriptModel tmodel;
+        tmodel.id = it->second;
+        tmodel.length = index.target_lens_[tmodel.id];
+        auto it2 = geneNameToId.find(gene_name);
+        if (it2 == geneNameToId.end()) {
+          GeneModel gmodel;
+          gmodel.id = genes.size();
+          gmodel.name = gene_name;
+          gmodel.commonName = gene_common_name;
+          geneNameToId.insert({gmodel.name, gmodel.id});
+          genes.push_back(std::move(gmodel));
+          tmodel.gene_id = gmodel.id;
+        } else {
+          tmodel.gene_id = it2->second;
+        }
+        transcripts[tmodel.id] = std::move(tmodel);
+      } else {
+        std::cerr << "Error: Invalid transcript: " << txp << " in " << genemap_fn << std::endl;
+        exit(1);
+      }
+    }
+  } else {
+    std::cerr << "Error: could not open file " << genemap_fn << std::endl;
+    exit(1);
+  }
+}


=====================================
src/GeneModel.h
=====================================
@@ -99,6 +99,7 @@ struct Transcriptome {
   void loadChromosomes(const std::string &chrom_fn);
   bool translateTrPosition(const int tr, const int pos, const int length,  bool strand, TranscriptAlignment &aln) const;
   void parseGTF(const std::string &gtf_fn, const KmerIndex& index, const ProgramOptions& options, bool guessChromosomes);
+  void parseGeneMap(const std::string &genemap_fn, const KmerIndex& index, const ProgramOptions& options);
   int addGTFLine(const std::string& line, const KmerIndex& index, bool guessChromosome);
   void loadTranscriptome(const KmerIndex& index, std::istream &in, const ProgramOptions& options);
 


=====================================
src/Inspect.h
=====================================
@@ -3,6 +3,7 @@
 
 #include <iostream>
 #include <unordered_set>
+#include <limits>
 #include "KmerIndex.h"
 #include "GeneModel.h"
 


=====================================
src/KmerIndex.cpp
=====================================
@@ -1,6 +1,7 @@
 #include "KmerIndex.h"
 #include <algorithm>
 #include <random>
+#include <sstream>
 #include <ctype.h>
 #include <zlib.h>
 #include <unordered_set>
@@ -782,6 +783,12 @@ bool KmerIndex::fwStep(Kmer km, Kmer& end) const {
 }
 
 void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable) {
+  if (opt.index.empty() && !loadKmerTable) {
+    // Make an index from transcript and EC files
+    loadTranscriptsFromFile(opt);
+    loadECsFromFile(opt);
+    return;
+  }
 
   std::string& index_in = opt.index;
   std::ifstream in;
@@ -967,8 +974,73 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable) {
   buffer=nullptr;
   
   in.close();
+  
+  if (!opt.ecFile.empty()) {
+    loadECsFromFile(opt);
+  }
+}
+
+void KmerIndex::loadECsFromFile(const ProgramOptions& opt) {
+  ecmap.clear();
+  ecmapinv.clear();
+  int i = 0;
+  std::ifstream in((opt.ecFile));
+  if (in.is_open()) {
+    std::string line;
+    while (getline(in, line)) {
+      std::stringstream ss(line);
+      int ec;
+      std::string transcripts;
+      ss >> ec >> transcripts;
+      if (i != ec) {
+        std::cerr << "Error: equivalence class file has a misplaced equivalence class."
+                  << " Found " << ec << ", expected " << i << std::endl;
+        exit(1);
+      }
+      std::vector<int> tmp_vec;
+      std::stringstream ss2(transcripts);
+      while(ss2.good()) {
+        std::string tmp_ecval;
+        getline(ss2, tmp_ecval, ',');
+        int tmp_ecval_num = std::atoi(tmp_ecval.c_str());
+        if (tmp_ecval_num < 0 || tmp_ecval_num >= num_trans) {
+          std::cerr << "Error: equivalence class file has invalid value: " 
+                    << tmp_ecval << " in " << transcripts << std::endl;
+          exit(1);
+        }
+        tmp_vec.push_back(tmp_ecval_num);
+      }
+      ecmap.push_back(tmp_vec); // copy
+      ecmapinv.insert({std::move(tmp_vec), i}); // move
+      i++;
+    }
+  } else {
+    std::cerr << "Error: could not open file " << opt.ecFile << std::endl;
+    exit(1);
+  }
+  std::cerr << "[index] number of equivalence classes loaded from file: "
+            << pretty_num(ecmap.size()) << std::endl;
 }
 
+void KmerIndex::loadTranscriptsFromFile(const ProgramOptions& opt) {
+  target_names_.clear();
+  int i = 0;
+  std::ifstream in((opt.transcriptsFile));
+  if (in.is_open()) {
+    std::string txp;
+    while (in >> txp) {
+      target_names_.push_back(txp);
+      i++;
+    }
+  } else {
+    std::cerr << "Error: could not open file " << opt.transcriptsFile << std::endl;
+    exit(1);
+  }
+  num_trans = i;
+  target_lens_.assign(num_trans, 0);
+  std::cerr << "[index] number of targets loaded from file: "
+            << pretty_num(num_trans) << std::endl;
+}
 
 int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2, int ec) const {
   bool d1 = true;


=====================================
src/KmerIndex.h
=====================================
@@ -121,6 +121,8 @@ struct KmerIndex {
   // load methods
   void load(ProgramOptions& opt, bool loadKmerTable = true);
   void loadTranscriptSequences() const;
+  void loadECsFromFile(const ProgramOptions& opt);
+  void loadTranscriptsFromFile(const ProgramOptions& opt);
   void clear();
 
   // positional information


=====================================
src/MinCollector.cpp
=====================================
@@ -1,5 +1,6 @@
 #include "MinCollector.h"
 #include <algorithm>
+#include <limits>
 
 // utility functions
 


=====================================
src/PlaintextWriter.cpp
=====================================
@@ -64,6 +64,53 @@ void plaintext_writer(
   of.close();
 }
 
+void plaintext_writer_gene(
+    const std::string& out_name,
+    const std::vector<std::string>& targ_ids,
+    const std::vector<double>& alpha,
+    const std::vector<double>& eff_lens,
+    const Transcriptome& model
+){
+  
+  std::ofstream of;
+  of.open( out_name );
+  
+  if (!of.is_open()) {
+    std::cerr << "Error: Couldn't open file: " << out_name << std::endl;
+    
+    exit(1);
+  }
+  
+  std::vector<double> gc;
+  std::vector<double> gc_tpm;
+  gc.assign(model.genes.size(), 0.0);
+  gc_tpm.assign(model.genes.size(), 0.0);
+  auto tpm = counts_to_tpm(alpha, eff_lens);
+  for (int i = 0; i < alpha.size(); i++) {
+    if (alpha[i] > 0.0) {
+      int g_id = model.transcripts[i].gene_id;
+      if (g_id != -1) {
+        gc[g_id] += alpha[i];
+        gc_tpm[g_id] += tpm[i];
+      }
+    }
+  }
+
+  of << "gene_id" << "\t"
+     << "gene_name" << "\t"
+     << "est_counts" << "\t"
+     << "tpm" << "\n";
+
+  for (int i = 0; i < gc.size(); i++) {
+    of << model.genes[i].name << '\t'
+       << model.genes[i].commonName << '\t'
+       << gc[i] << '\t'
+       << gc_tpm[i] << "\n";
+  }
+
+  of.close();
+}
+
 std::string to_json(const std::string& id, const std::string& val, bool quote,
     bool comma, int level) {
   std::string out;
@@ -239,7 +286,7 @@ void writeFLD(const std::string &filename, const std::vector<std::pair<double, d
   of.close();
 }
 
-void writeGeneList(const std::string &filename, const Transcriptome& model) {
+void writeGeneList(const std::string &filename, const Transcriptome& model, bool writeNamesOnly) {
   std::ofstream of;
   of.open(filename.c_str(), std::ios::out);
   if (!of.is_open()) {
@@ -247,7 +294,11 @@ void writeGeneList(const std::string &filename, const Transcriptome& model) {
     exit(1);
   }
   for (int j = 0; j < model.genes.size(); j++) {
-    of << j << "\t" << model.genes[j].name << "\t" << model.genes[j].commonName << "\n";
+    if (writeNamesOnly) {
+      of << model.genes[j].name << "\n";
+    } else {
+      of << j << "\t" << model.genes[j].name << "\t" << model.genes[j].commonName << "\n";
+    }
   }
   of.close();
 }


=====================================
src/PlaintextWriter.h
=====================================
@@ -21,6 +21,14 @@ void plaintext_writer(
     const std::vector<int>& lens
     );
 
+void plaintext_writer_gene(
+    const std::string& out_name,
+    const std::vector<std::string>& targ_ids,
+    const std::vector<double>& alpha,
+    const std::vector<double>& eff_lens,
+    const Transcriptome& model
+);
+
 std::string to_json(const std::string& id, const std::string& val, bool quote,
     bool comma = true, int level = 1);
 
@@ -53,7 +61,10 @@ void writeECList(
 
 void writeFLD(const std::string &filename, const std::vector<std::pair<double, double>> &flds);
 
-void writeGeneList(const std::string &filename, const Transcriptome& model);
+void writeGeneList(const std::string &filename, const Transcriptome& model, bool writeNamesOnly = false);
+
+std::vector<double> counts_to_tpm(const std::vector<double>& est_counts,
+                                  const std::vector<double>& eff_lens);
 
 template<typename T>
 void writeSparseBatchMatrix(


=====================================
src/ProcessReads.cpp
=====================================
@@ -26,6 +26,7 @@
 #include "BUSData.h"
 #include "BUSTools.h"
 #include <htslib/kstring.h>
+#include <unordered_set>
 
 
 void printVector(const std::vector<int>& v, std::ostream& o) {
@@ -352,6 +353,23 @@ void MasterProcessor::processReads() {
     }
   } else if (opt.bus_mode) {
     std::vector<std::thread> workers;
+    parallel_bus_read = opt.threads > 4 && opt.files.size() > opt.busOptions.nfiles && !opt.num && !opt.pseudobam;
+    if (parallel_bus_read) {
+      delete SR;
+      SR = nullptr;
+      assert(opt.files.size() % opt.busOptions.nfiles == 0);
+      int nbatches = opt.files.size() / opt.busOptions.nfiles;
+      std::vector<std::mutex> mutexes(nbatches);
+      parallel_bus_reader_locks.swap(mutexes);
+      for (int i = 0; i < nbatches; i++) {
+        FastqSequenceReader fSR(opt);
+        fSR.files.erase(fSR.files.begin(), fSR.files.begin()+opt.busOptions.nfiles*i);
+        fSR.files.erase(fSR.files.begin()+opt.busOptions.nfiles, fSR.files.end());
+        assert(fSR.files.size() == opt.busOptions.nfiles);
+        FSRs.push_back(std::move(fSR));
+      }
+    }
+    
     for (int i = 0; i < opt.threads; i++) {
       workers.emplace_back(std::thread(BUSProcessor(index,opt,tc,*this)));
     }
@@ -375,6 +393,46 @@ void MasterProcessor::processReads() {
     }
 
   
+  } else if (opt.batch_mode && opt.batch_bus) {
+    std::vector<std::thread> workers;
+    int num_ids = opt.batch_ids.size();
+    int id =0;
+    while (id < num_ids) {
+      // TODO: put in thread pool
+      workers.clear();
+      int nt = std::min(opt.threads, (num_ids - id));
+      if (opt.pseudo_read_files_supplied) { // pseudo_read_files_supplied: read using SR (no batch identifier) rather than batchSR
+        for (int i = 0; i < opt.threads; i++) {
+          workers.emplace_back(std::thread(BUSProcessor(index, opt, tc, *this, id,0)));
+        }
+        for (int i = 0; i < opt.threads; i++) {
+          workers[i].join();
+        }
+        id++;
+        assert(id == num_ids);
+      } else {
+        for (int i = 0; i < nt; i++,id++) {
+          workers.emplace_back(std::thread(BUSProcessor(index, opt, tc, *this, id,i)));
+        }
+        // let the workers do their thing
+        for (int i = 0; i < nt; i++) {
+          workers[i].join(); //wait for them to finish
+        }
+      }
+    }
+    
+    // now handle the modification of the mincollector
+    for (int i = 0; i < bus_ecmap.size(); i++) {
+      auto &u = bus_ecmap[i];
+      int ec = index.ecmapinv.size();
+      auto it = bus_ecmapinv.find(u);
+      if (it->second != ec) {
+        std::cout << "Error" << std::endl;
+        exit(1);
+      }      
+      index.ecmapinv.insert({u,ec});
+      index.ecmap.push_back(u);
+    }
   } else if (opt.batch_mode) {    
     std::vector<std::thread> workers;
     int num_ids = opt.batch_ids.size();
@@ -385,13 +443,24 @@ void MasterProcessor::processReads() {
       workers.clear();
       int nt = std::min(opt.threads, (num_ids - id));
       int first_id = id;
-      for (int i = 0; i < nt; i++,id++) {
-        tmp_bc[i].assign(tc.counts.size(), 0);     
-        workers.emplace_back(std::thread(ReadProcessor(index, opt, tc, *this, id,i)));
-      }
-      
-      for (int i = 0; i < nt; i++) {
-        workers[i].join();
+      if (opt.pseudo_read_files_supplied) {
+        for (int i = 0; i < opt.threads; i++) {
+          tmp_bc[i].assign(tc.counts.size(), 0);     
+          workers.emplace_back(std::thread(ReadProcessor(index, opt, tc, *this, id, 0)));
+        }
+        for (int i = 0; i < opt.threads; i++) {
+          workers[i].join();
+        } 
+        id++;
+        assert(id == num_ids);
+      } else {
+        for (int i = 0; i < nt; i++,id++) {
+          tmp_bc[i].assign(tc.counts.size(), 0);
+          workers.emplace_back(std::thread(ReadProcessor(index, opt, tc, *this, id,i)));
+        }
+        for (int i = 0; i < nt; i++) {
+          workers[i].join();
+        }
       }
       
       if (!opt.umi) {
@@ -465,7 +534,7 @@ void MasterProcessor::processReads() {
           }
           int ec = tc.findEC(t.first);
           assert(ec != -1);
-          ++tmp_counts[ec];
+          tmp_counts[ec] += t.second;
         }
         auto& bc = batchCounts[id];
         for (int j = 0; j < tmp_counts.size(); j++) {
@@ -531,7 +600,7 @@ void MasterProcessor::processReads() {
   if (opt.pseudobam) {
     pseudobatchf_out.close();
   }
-  if (opt.bus_mode) {
+  if (opt.bus_mode || opt.batch_bus) {
     busf_out.close();
   }
 }
@@ -737,7 +806,7 @@ void MasterProcessor::update(const std::vector<int>& c, const std::vector<std::v
   // acquire the writer lock
   std::lock_guard<std::mutex> lock(this->writer_lock);
 
-  if (!opt.batch_mode) {
+  if (!opt.batch_mode || opt.batch_bus) {
     for (int i = 0; i < c.size(); i++) {
       tc.counts[i] += c[i]; // add up ec counts
       nummapped += c[i];
@@ -756,7 +825,7 @@ void MasterProcessor::update(const std::vector<int>& c, const std::vector<std::v
     }    
   }
 
-  if (!opt.batch_mode) {
+  if (!opt.batch_mode || opt.batch_bus) {
     for(auto &u : newEcs) {
       ++newECcount[u];
     }
@@ -829,7 +898,7 @@ void MasterProcessor::update(const std::vector<int>& c, const std::vector<std::v
     }
   }
 
-  if (opt.bus_mode) {
+  if (opt.bus_mode || opt.batch_bus) {
     int bus_bc_sum = 0;
     int bus_umi_sum = 0;
     for (int i = 0; i <= 32; i++) {
@@ -978,7 +1047,7 @@ void ReadProcessor::operator()() {
   while (true) {
     int readbatch_id;
     // grab the reader lock
-    if (mp.opt.batch_mode) {
+    if (mp.opt.batch_mode && !mp.opt.pseudo_read_files_supplied) {
       if (batchSR.empty()) {
         return;
       } else {
@@ -1294,12 +1363,23 @@ BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, co
    // initialize buffer
    bufsize = mp.bufsize;
    buffer = new char[bufsize];  
+   if (opt.batch_mode) {
+     assert(id != -1);
+     batchSR.files = opt.batch_files[id];
+     batchSR.nfiles = opt.batch_files[id].size();
+     batchSR.reserveNfiles(opt.batch_files[id].size());
+     if (opt.umi) {
+       batchSR.umi_files = {opt.umi_files[id]};
+     }
+     batchSR.paired = !opt.single_end;
+   }
+
    seqs.reserve(bufsize/50);
    newEcs.reserve(1000);
    bv.reserve(1000);
    counts.reserve((int) (tc.counts.size() * 1.25));
-   memset(&bc_len[0],0,33);
-   memset(&umi_len[0],0,33);
+   memset(&bc_len[0],0,sizeof(bc_len));
+   memset(&umi_len[0],0,sizeof(umi_len));
 
    clear();
 }
@@ -1323,9 +1403,10 @@ BUSProcessor::BUSProcessor(BUSProcessor && o) :
   flens(std::move(o.flens)),
   bias5(std::move(o.bias5)),
   bv(std::move(o.bv)),  
+  batchSR(std::move(o.batchSR)),
   counts(std::move(o.counts)) {
-    memcpy(&bc_len[0], &o.bc_len[0], 33);
-    memcpy(&umi_len[0], &o.umi_len[0], 33);
+    memcpy(&bc_len[0], &o.bc_len[0], sizeof(bc_len));
+    memcpy(&umi_len[0], &o.umi_len[0], sizeof(umi_len));
     buffer = o.buffer;
     o.buffer = nullptr;
     o.bufsize = 0;
@@ -1339,22 +1420,42 @@ BUSProcessor::~BUSProcessor() {
 }
 
 void BUSProcessor::operator()() {
+  uint64_t parallel_bus_read_counter = 0;
+  std::unordered_set<int> parallel_bus_read_empty;
   while (true) {
     int readbatch_id;
+    std::vector<std::string> umis;
     // grab the reader lock
-    {
+    if (mp.opt.batch_mode && !mp.opt.pseudo_read_files_supplied) {
+      if (batchSR.empty()) {
+        return;
+      } else {
+        batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam );
+      }
+    } else if (mp.opt.bus_mode && mp.parallel_bus_read) {
+      int nbatches = mp.opt.files.size() / mp.opt.busOptions.nfiles;
+      int i = parallel_bus_read_counter % nbatches;
+      if (parallel_bus_read_empty.size() >= nbatches) {
+        return;
+      }
+      parallel_bus_read_counter++;
+      std::lock_guard<std::mutex> lock(mp.parallel_bus_reader_locks[i]);
+      if (mp.FSRs[i].empty()) {
+        parallel_bus_read_empty.emplace(i);
+        continue;
+      }
+      mp.FSRs[i].fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
+    } else {
       std::lock_guard<std::mutex> lock(mp.reader_lock);
       if (mp.SR->empty()) {
         // nothing to do
         return;
       } else {
         // get new sequences
-        std::vector<std::string> umis;
-        mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, false);
+        mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
       }
       // release the reader lock
     }
-    // do the same for BUS ?!?
     
     pseudobatch.aln.clear();
     pseudobatch.batch_id = readbatch_id;
@@ -1377,12 +1478,32 @@ void BUSProcessor::processBuffer() {
   
   u.reserve(1000);
   v.reserve(1000);
+  v2.reserve(1000);
   vtmp.reserve(1000);
 
-  memset(&bc_len[0], 0, 33);
-  memset(&umi_len[0], 0, 33);
+  memset(&bc_len[0], 0, sizeof(bc_len));
+  memset(&umi_len[0], 0, sizeof(umi_len));
 
   const BUSOptions& busopt = mp.opt.busOptions;
+  bool bulk_like = mp.opt.batch_bus || busopt.umi[0].fileno == -1; // Treat like bulk: no UMI
+  
+  auto &tcount = mp.tlencount;
+  if (mp.opt.batch_bus) {
+    tcount = mp.tlencounts[id];
+  }
+  bool findFragmentLength = tcount < 10000 && busopt.paired;
+  int flengoal = 0;
+  flens.clear();
+  if (findFragmentLength) {
+    flengoal = (10000 - tcount);
+    if (flengoal <= 0) {
+      findFragmentLength = false;
+      flengoal = 0;
+    } else {
+      flens.resize(tc.flens.size(), 0);
+    }
+  }
+
  
   char buffer[100];
   memset(buffer, 0, 100);
@@ -1409,6 +1530,15 @@ void BUSProcessor::processBuffer() {
 
   bool singleSeq = busopt.seq.size() ==1 ;
   const BUSOptionSubstr seqopt = busopt.seq.front();
+  bool check_tag_sequence = !mp.opt.tagsequence.empty();
+  int taglen = 0;
+  uint64_t tag_binary = 0;
+  int hamming_dist = 1;
+  if (check_tag_sequence) {
+    taglen = mp.opt.tagsequence.length();
+    uint32_t f = 0;
+    tag_binary = stringToBinary(mp.opt.tagsequence, f);
+  }
 
 
   //int incf = (bam) ? 1 : busopt.nfiles-1;
@@ -1421,42 +1551,97 @@ void BUSProcessor::processBuffer() {
     
     // find where the sequence is
     const char *seq = nullptr;
+    const char *seq2 = nullptr;
+    
+    bool ignore_umi = false;
+    
+    // copy the umi
+    int ulen = 0;
+    bool bad_umi = false;
+    if (bulk_like) {
+      ulen = 1;
+      umi[0] = 'A';
+      umi[ulen] = 0;
+      ignore_umi = true;
+    } else {
+      for (auto &umic : busopt.umi) {
+        int umilen = (0 == umic.stop) ? l[umic.fileno] - umic.start : umic.stop - umic.start;
+        if (l[umic.fileno] < umic.start + umilen || umilen <= 0) {
+          bad_umi = true;
+          break;
+        }
+        if (check_tag_sequence && ulen == 0) {
+          umilen += taglen; // Expand to include tag sequence
+          memcpy(umi, s[umic.fileno] + umic.start - taglen, umilen);
+        } else {
+          memcpy(umi+ulen, s[umic.fileno] + umic.start, umilen);
+        }
+        ulen += umilen;
+      }
+      if (bad_umi) {
+        continue;
+      }
+      umi[ulen] = 0;
+    }
+
+    uint64_t umi_binary = -1;
+    if (check_tag_sequence) {
+      uint32_t f = 0;
+      umi_binary = stringToBinary(umi, ulen, f);
+      if (hamming(tag_binary, umi_binary >> 2*(ulen-taglen), taglen) <= hamming_dist) { // if tag present
+        umi_binary = umi_binary & ~(~0ULL << (2*(ulen - taglen)));
+        ulen = ulen - taglen;
+        if (ulen <= 32) {
+          umi_len[ulen]++;
+        }
+      } else {
+        ignore_umi = true; // tag not present, it's not a UMI-read
+        umi_binary = -1;
+      }
+    } else {
+      if (ulen <= 32) {
+        umi_len[ulen]++;
+      }
+    }
     
     size_t seqlen = 0;
+    size_t seqlen2 = 0;
     if (singleSeq) {
-      seq = s[seqopt.fileno] + seqopt.start;
-      seqlen = l[seqopt.fileno];
+      int seqstart = (ignore_umi && busopt.umi[0].fileno == seqopt.fileno ? busopt.umi[0].start - taglen : seqopt.start);
+      seq = s[seqopt.fileno] + seqstart;
+      seqlen = (seqopt.stop == 0) ? l[seqopt.fileno]-seqstart : seqopt.stop - seqstart;
+    } else if (busopt.paired) {
+      const auto &sopt1 = busopt.seq[0];
+      const auto &sopt2 = busopt.seq[1];
+      int seqstart1 = (ignore_umi && busopt.umi[0].fileno == sopt1.fileno ? busopt.umi[0].start - taglen : sopt1.start);
+      int seqstart2 = (ignore_umi && busopt.umi[0].fileno == sopt2.fileno ? busopt.umi[0].start - taglen : sopt2.start);
+      int cplen1 = (sopt1.stop == 0) ? l[sopt1.fileno]-seqstart1 : sopt1.stop - seqstart1;
+      int cplen2 = (sopt2.stop == 0) ? l[sopt2.fileno]-seqstart2 : sopt2.stop - seqstart2;
+      seq = s[sopt1.fileno] + seqstart1;
+      seq2 = s[sopt2.fileno] + seqstart2;
+      seqlen = cplen1;
+      seqlen2 = cplen2;
     } else {
       seqbuffer.clear();
       for (int j = 0; j < busopt.seq.size(); j++) {
         const auto &sopt = busopt.seq[j];
-        int cplen = (sopt.stop == 0) ? l[sopt.fileno]-sopt.start : sopt.stop - sopt.start;
-        seqbuffer.append(s[sopt.fileno] + sopt.start, cplen);
-        seqbuffer.push_back('N');        
+        int seqstart = (ignore_umi && busopt.umi[0].fileno == sopt.fileno ? busopt.umi[0].start - taglen : sopt.start);
+        int cplen = (sopt.stop == 0) ? l[sopt.fileno]-seqstart : sopt.stop - seqstart;
+        seqbuffer.append(s[sopt.fileno] + seqstart, cplen);
+        seqbuffer.push_back('N');
       }
       seqbuffer.push_back(0);
       seq = seqbuffer.c_str();
       seqlen = seqbuffer.size();
     }
 
-    // copy the umi
-    int umilen = (busopt.umi.start == busopt.umi.stop) ? l[busopt.umi.fileno] - busopt.umi.start : busopt.umi.stop - busopt.umi.start;
-    if (l[busopt.umi.fileno] < busopt.umi.start + umilen) {
-      continue; // too short
-    }
-    memcpy(umi, s[busopt.umi.fileno] + busopt.umi.start, umilen);
-    umi[umilen] = 0;
-    if (umilen >= 0 && umilen <= 32) {
-      umi_len[umilen]++;
-    }
-
     
 //    auto &bcc = busopt.bc[0];
     int blen = 0;
     bool bad_bc = false;
     for (auto &bcc : busopt.bc) {
-      int bclen = (bcc.start == bcc.stop) ? l[bcc.fileno] - bcc.start : bcc.stop - bcc.start;
-      if (l[bcc.fileno] < bcc.start + bclen) {
+      int bclen = (0 == bcc.stop) ? l[bcc.fileno] - bcc.start : bcc.stop - bcc.start;
+      if (l[bcc.fileno] < bcc.start + bclen || bclen <= 0) {
         bad_bc = true;
         break;
       }
@@ -1466,6 +1651,11 @@ void BUSProcessor::processBuffer() {
     if (bad_bc) {
       continue;
     }
+    if (mp.opt.batch_bus) {
+      ignore_umi = true;
+      blen = BUSFORMAT_FAKE_BARCODE_LEN;
+      memcpy(bc, binaryToString(id, blen).c_str(), blen); // Create fake barcode that identifies the batch
+    }
     bc[blen] = 0;
 
     if (blen >= 0 && blen <= 32) {
@@ -1483,14 +1673,74 @@ void BUSProcessor::processBuffer() {
 
     // process 2nd read
     index.match(seq,seqlen, v);
+    if (busopt.paired) {
+      v2.clear();
+      index.match(seq2,seqlen2, v2);
+    }
 
     // collect the target information
     int ec = -1;
-    int r = tc.intersectKmers(v, v2, false,u);
+    int r = tc.intersectKmers(v, v2, !busopt.paired,u);
     if (!u.empty()) {      
       ec = tc.findEC(u);
     }
 
+    if ((!ignore_umi || bulk_like) && mp.opt.strand_specific && !u.empty()) { // Strand-specificity
+      int p = -1;
+      Kmer km;
+      KmerEntry val;
+      if (!v.empty()) {
+        vtmp.clear();
+        bool firstStrand = (mp.opt.strand == ProgramOptions::StrandType::FR); // FR have first read mapping forward
+        p = findFirstMappingKmer(v,val);
+        km = Kmer((seq+p));
+        bool strand = (val.isFw() == (km == km.rep())); // k-mer maps to fw strand?
+        // might need to optimize this
+        const auto &c = index.dbGraph.contigs[val.contig];
+        for (auto tr : u) {
+          for (auto ctx : c.transcripts) {
+            if (tr == ctx.trid) {
+              if ((strand == ctx.sense) == firstStrand) {
+                // swap out 
+                vtmp.push_back(tr);
+              } 
+              break;
+            }
+          }          
+        }
+        if (vtmp.size() < u.size()) {
+          u = vtmp; // copy
+        }
+      }
+      if (!v2.empty()) {
+        vtmp.clear();
+        bool secondStrand = (mp.opt.strand == ProgramOptions::StrandType::RF);
+        p = findFirstMappingKmer(v2,val);
+        km = Kmer((seq2+p));
+        bool strand = (val.isFw() == (km == km.rep())); // k-mer maps to fw strand?
+        // might need to optimize this
+        const auto &c = index.dbGraph.contigs[val.contig];
+        for (auto tr : u) {
+          for (auto ctx : c.transcripts) {
+            if (tr == ctx.trid) {
+              if ((strand == ctx.sense) == secondStrand) {
+                // swap out 
+                vtmp.push_back(tr);
+              } 
+              break;
+            }
+          }          
+        }
+        if (vtmp.size() < u.size()) {
+          u = vtmp; // copy
+        }
+      }
+      ec = -1;
+      if (!u.empty()) { // update the ec
+        ec = tc.findEC(u);
+      }
+    }
+
     // find the ec
     if (!u.empty()) {
       BUSData b;      
@@ -1498,15 +1748,26 @@ void BUSProcessor::processBuffer() {
       b.flags = 0;
       b.barcode = stringToBinary(bc, blen, f);
       b.flags |= f;
-      b.UMI = stringToBinary(umi, umilen, f);
+      b.UMI = check_tag_sequence || bulk_like ? umi_binary : stringToBinary(umi, ulen, f);
       b.flags |= (f) << 8;
       b.count = 1;
       //std::cout << std::string(s1,10)  << "\t" << b.barcode << "\t" << std::string(s1+10,16) << "\t" << b.UMI << "\n";
       if (num) {
         b.flags = (uint32_t) flags[i / jmax];
       }
-
+      
       //ec = tc.findEC(u);
+
+      if (busopt.paired && ignore_umi) {
+        if (findFragmentLength && flengoal > 0 && 0 <= ec && ec < index.num_trans && !v.empty() && !v2.empty()) {
+          // try to map the reads
+          int tl = index.mapPair(seq, seqlen, seq2, seqlen2, ec);
+          if (0 < tl && tl < flens.size()) {
+            flens[tl]++;
+            flengoal--;
+          }
+        }
+      }
       
       // count the pseudoalignment
       if (ec == -1 || ec >= counts.size()) {
@@ -1524,16 +1785,20 @@ void BUSProcessor::processBuffer() {
 
     if (mp.opt.pseudobam) {
       PseudoAlignmentInfo info;
-      info.id = i;
-      info.paired = false;
+      info.id = i / jmax;
+      info.paired = busopt.paired;
       uint32_t f = 0;
       info.barcode = stringToBinary(bc, blen, f);
-      info.UMI = stringToBinary(umi, umilen, f);
+      info.UMI = check_tag_sequence ? umi_binary : stringToBinary(umi, ulen, f);
       if (!u.empty()) {
         info.r1empty = v.empty();
         KmerEntry val;
         info.k1pos = findFirstMappingKmer(v,val);
         info.k2pos = -1;
+        if (busopt.paired) {
+          info.r2empty = v2.empty();
+          info.k2pos = findFirstMappingKmer(v2,val);
+        }
                 
         if (ec != -1) {
           info.ec_id = ec;
@@ -1670,7 +1935,11 @@ void AlnProcessor::operator()() {
         mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true);
         readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch);
         assert(pseudobatch.batch_id == readbatch_id);
-        assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
+        if (mp.opt.bus_mode) {
+          assert(pseudobatch.aln.size() == seqs.size()/mp.opt.busOptions.nfiles); // sanity checks
+        } else {
+          assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
+        }
       }
       // release the reader lock
     }
@@ -1689,7 +1958,6 @@ void AlnProcessor::operator()() {
 
 
 void AlnProcessor::processBufferTrans() {
-
   /* something simple where we can construct the bam records */
   std::vector<bam1_t> bv;
 
@@ -2091,7 +2359,6 @@ void AlnProcessor::processBufferTrans() {
 
 
 void AlnProcessor::processBufferGenome() {
-
   /* something simple where we can construct the bam records */
   std::vector<bam1_t> bv;
   int idnum = 0;
@@ -2110,14 +2377,21 @@ void AlnProcessor::processBufferGenome() {
     genome_auxlen -= 7;
   }
   if (mp.opt.bus_mode) {
-    //todo replace with what is written to busfile
     bclen = mp.opt.busOptions.getBCLength();
     if (bclen == 0) {
-      bclen = 32;
+      for (int i = 0; i <= 32; i++) {
+        if (mp.bus_bc_len[i] > mp.bus_bc_len[bclen]) {
+          bclen = i;
+        }
+      }
     }
     umilen = mp.opt.busOptions.getUMILength();
     if (umilen == 0) {
-      umilen = 32;
+      for (int i = 0; i <= 32; i++) {
+        if (mp.bus_umi_len[i] > mp.bus_umi_len[umilen]) {
+          umilen = i;
+        }
+      }
     }
 
     genome_auxlen += (bclen + umilen) + 8;
@@ -2129,28 +2403,31 @@ void AlnProcessor::processBufferGenome() {
   KmerEntry val1, val2;
   if  (mp.opt.bus_mode) {
     paired = false;
+    if (mp.opt.busOptions.paired) {
+      paired = true;
+    }
   }
-  bool readpaired = paired || (mp.opt.bus_mode && mp.opt.busOptions.nfiles == 2);
-  
-  
 
   for (int i = 0; i < n; i++) {
     bam1_t b1,b2, b1c, b2c;
-    int si1 = (readpaired) ? 2*i : i;
-    int si2 = (readpaired) ? 2*i +1 : -1;
-    
+    int si1 = (paired) ? 2*i : i;
+    int si2 = (paired) ? 2*i +1 : -1;
+    // For BUS file processing with paired-end reads:
+    if (mp.opt.bus_mode) {
+      const BUSOptions& busopt = mp.opt.busOptions;
+      si1 = i*busopt.nfiles + busopt.seq[0].fileno;
+      if (paired) {
+        si2 = i*busopt.nfiles + busopt.seq[1].fileno;
+      }
+    }
     int rlen1 = seqs[si1].second;
     int rlen2;
-    if (readpaired) {
+    int seq1_offset = 0;
+    int seq2_offset = 0;
+    if (paired) {
       rlen2 = seqs[si2].second;
     }
     
-    if (mp.opt.busOptions.seq.front().fileno == 1) {
-      std::swap(si1,si2);
-      std::swap(rlen1,rlen2);
-    }
-    
-
     // fill in the bam core info
     b1.core.tid = -1;
     b1.core.pos = -1;
@@ -2176,16 +2453,42 @@ void AlnProcessor::processBufferGenome() {
     }
 
     PseudoAlignmentInfo &pi = pseudobatch.aln[i];
-
-    // fill in the data info
-    fillBamRecord(b1, nullptr, seqs[si1].first, names[si1].first,  quals[si1].first, seqs[si1].second, names[si1].second, pi.r1empty, genome_auxlen);
-    //b1.id = idnum++;
-    if (paired) {
-      fillBamRecord(b2, nullptr, seqs[si2].first, names[si2].first,  quals[si2].first, seqs[si2].second, names[si2].second, pi.r2empty, genome_auxlen);
-      // b2.id = idnum++;
-    }
     
-    if (mp.opt.bus_mode) {
+    if (!mp.opt.bus_mode) {
+      // fill in the data info
+      fillBamRecord(b1, nullptr, seqs[si1].first, names[si1].first,  quals[si1].first, seqs[si1].second, names[si1].second, pi.r1empty, genome_auxlen);
+      //b1.id = idnum++;
+      if (paired) {
+        fillBamRecord(b2, nullptr, seqs[si2].first, names[si2].first,  quals[si2].first, seqs[si2].second, names[si2].second, pi.r2empty, genome_auxlen);
+        // b2.id = idnum++;
+      }
+    }
+    else {
+      bool tag_present = !mp.opt.tagsequence.empty() && pi.UMI != -1; // tag+UMI present in the sequence
+      bool umi_first_file = mp.opt.busOptions.umi[0].fileno == mp.opt.busOptions.seq[0].fileno; // UMI-containing file is the first file (file number 0)
+      int tagseqstart;
+      if (tag_present) {
+        tagseqstart = umi_first_file ? mp.opt.busOptions.seq[0].start : mp.opt.busOptions.seq[1].start; // Position where the actual sequence begins when tag+UMI present
+      }
+
+      // fill in the data info
+      if (tag_present && umi_first_file) {
+        rlen1 = seqs[si1].second-tagseqstart;
+        seq1_offset = tagseqstart;
+        fillBamRecord(b1, nullptr, seqs[si1].first+tagseqstart, names[si1].first,  quals[si1].first+tagseqstart, seqs[si1].second-tagseqstart, names[si1].second, pi.r1empty, genome_auxlen);
+      } else {
+        fillBamRecord(b1, nullptr, seqs[si1].first, names[si1].first,  quals[si1].first, seqs[si1].second, names[si1].second, pi.r1empty, genome_auxlen);
+      }
+      if (paired) {
+        if (tag_present && !umi_first_file) {
+          rlen2 = seqs[si2].second-tagseqstart;
+          seq2_offset = tagseqstart;
+          fillBamRecord(b2, nullptr, seqs[si2].first+tagseqstart, names[si2].first,  quals[si2].first+tagseqstart, seqs[si2].second-tagseqstart, names[si2].second, pi.r2empty, genome_auxlen);
+        } else {
+          fillBamRecord(b2, nullptr, seqs[si2].first, names[si2].first,  quals[si2].first, seqs[si2].second, names[si2].second, pi.r2empty, genome_auxlen);
+        }
+      }
+
       b1.data[b1.l_data] = 'C';
       b1.data[b1.l_data+1] = 'R';
       b1.data[b1.l_data+2] = 'Z';
@@ -2196,9 +2499,25 @@ void AlnProcessor::processBufferGenome() {
       b1.data[b1.l_data] = 'U';
       b1.data[b1.l_data+1] = 'R';
       b1.data[b1.l_data+2] = 'Z';
-      umi = binaryToString(pi.UMI, umilen);
+      if (!mp.opt.tagsequence.empty() && pi.UMI == -1) { // Non-UMI read
+        umi = std::string(umilen, 'N');
+      } else {
+        umi = binaryToString(pi.UMI, umilen);
+      }
       memcpy(b1.data + b1.l_data + 3, umi.c_str(), umilen+2);
       b1.l_data += umilen + 4;
+      if (paired) {
+        b2.data[b2.l_data] = 'C';
+        b2.data[b2.l_data+1] = 'R';
+        b2.data[b2.l_data+2] = 'Z';
+        memcpy(b2.data + b2.l_data + 3, bc.c_str(), bclen+1);
+        b2.l_data += bclen + 4;
+        b2.data[b2.l_data] = 'U';
+        b2.data[b2.l_data+1] = 'R';
+        b2.data[b2.l_data+2] = 'Z';
+        memcpy(b2.data + b2.l_data + 3, umi.c_str(), umilen+2);
+        b2.l_data += umilen + 4;
+      }
     }
 
     if (pi.r1empty && pi.r2empty) {
@@ -2334,11 +2653,11 @@ void AlnProcessor::processBufferGenome() {
         std::pair<bool,bool> strInfo1 = {true,true}, strInfo2 = {true,true};
         
         if (!pi.r1empty) {
-          km1 = Kmer(seqs[si1].first + pi.k1pos);
+          km1 = Kmer(seqs[si1].first + seq1_offset + pi.k1pos);
           strInfo1 = strandednessInfo(km1, val1, ua);
         }
         if (paired && !pi.r2empty) {
-          km2 = Kmer(seqs[si2].first + pi.k2pos);
+          km2 = Kmer(seqs[si2].first + seq2_offset + pi.k2pos);
           strInfo2 = strandednessInfo(km2, val2, ua);
         }
         
@@ -2459,7 +2778,7 @@ void AlnProcessor::processBufferGenome() {
           b1c.core.tid = tra.first.chr;
           if (!pi.r1empty) {
             b1c.core.pos = tra.first.chrpos;
-            b1c.core.bin = hts_reg2bin(b1c.core.pos, b1c.core.pos + seqs[si1].second-1, 14, 5);
+            b1c.core.bin = hts_reg2bin(b1c.core.pos, b1c.core.pos + seqs[si1].second-seq1_offset-1, 14, 5);
             b1c.core.qual = 255;
             if (useEM) {
               memcpy(b1c.data + b1c.l_data - 4, &prob, 4); // set ZW tag
@@ -2470,7 +2789,7 @@ void AlnProcessor::processBufferGenome() {
             b2c.core.tid = tra.second.chr;
             if (!pi.r2empty) {
               b2c.core.pos = tra.second.chrpos;
-              b2c.core.bin = hts_reg2bin(b2c.core.pos, b2c.core.pos + seqs[si2].second, 14, 5);
+              b2c.core.bin = hts_reg2bin(b2c.core.pos, b2c.core.pos + seqs[si2].second-seq2_offset, 14, 5);
               b2c.core.qual = 255;
               if (useEM) {
                 memcpy(b2c.data + b2c.l_data - 4, &prob, 4);
@@ -2914,7 +3233,7 @@ bool FastqSequenceReader::fetchSequences(char *buf, const int limit, std::vector
         }
 
         numreads++;
-        flags.push_back(numreads);
+        flags.push_back(numreads-1);
       } else {
         return true; // read it next time
       }


=====================================
src/ProcessReads.h
=====================================
@@ -179,8 +179,6 @@ public:
       }
 
       if (opt.batch_mode) {
-        memset(&bus_bc_len[0],0,33);
-        memset(&bus_umi_len[0],0,33);
         batchCounts.assign(opt.batch_ids.size(), {});
         tlencounts.assign(opt.batch_ids.size(), 0);
         batchFlens.assign(opt.batch_ids.size(), std::vector<int>(1000,0));
@@ -198,10 +196,16 @@ public:
       if (opt.pseudobam) {
         pseudobatchf_out.open(opt.output + "/pseudoaln.bin", std::ios::out | std::ios::binary);
       }
-      if (opt.bus_mode) {
+      if (opt.bus_mode || opt.batch_bus) {
+        memset(&bus_bc_len[0],0,sizeof(bus_bc_len));
+        memset(&bus_umi_len[0],0,sizeof(bus_umi_len));
         busf_out.open(opt.output + "/output.bus", std::ios::out | std::ios::binary);
         
-        writeBUSHeader(busf_out, opt.busOptions.getBCLength(), opt.busOptions.getUMILength());
+        if (opt.batch_bus) {
+          writeBUSHeader(busf_out, BUSFORMAT_FAKE_BARCODE_LEN, 1);
+        } else {
+          writeBUSHeader(busf_out, opt.busOptions.getBCLength(), opt.busOptions.getUMILength());
+        }
       }
     }
 
@@ -227,10 +231,13 @@ public:
   }
 
   std::mutex reader_lock;
+  std::vector<std::mutex> parallel_bus_reader_locks;
+  bool parallel_bus_read;
   std::mutex writer_lock;
 
 
   SequenceReader *SR;
+  std::vector<FastqSequenceReader> FSRs;
   MinCollector& tc;
   KmerIndex& index;
   const Transcriptome& model;
@@ -335,6 +342,7 @@ public:
   int id;
   int local_id;
   PseudoAlignmentBatch pseudobatch;
+  FastqSequenceReader batchSR;
 
   int bc_len[33];
   int umi_len[33];


=====================================
src/common.h
=====================================
@@ -1,7 +1,7 @@
 #ifndef KALLISTO_COMMON_H
 #define KALLISTO_COMMON_H
 
-#define KALLISTO_VERSION "0.46.2"
+#define KALLISTO_VERSION "0.48.0"
 
 #include <string>
 #include <vector>
@@ -22,9 +22,11 @@ struct BUSOptionSubstr {
 struct BUSOptions {
   int nfiles;
   
-  BUSOptionSubstr umi;
+  std::vector<BUSOptionSubstr> umi;
   std::vector<BUSOptionSubstr> bc;
   std::vector<BUSOptionSubstr> seq;
+  
+  bool paired;
 
   int getBCLength() const {
     int r =0 ;
@@ -32,6 +34,8 @@ struct BUSOptions {
       for (auto& b : bc) {
         if (b.start < 0) {
           return 0;
+        } else if (b.stop == 0) {
+          return 0;
         } else {
           r += b.stop - b.start;
         }
@@ -41,11 +45,19 @@ struct BUSOptions {
   }
 
   int getUMILength() const {
-    if (umi.start >= 0) {
-      return umi.stop - umi.start;
-    } else {
-      return 0;
+    int r =0 ;
+    if (!umi.empty()) {
+      for (auto& u : umi) {
+        if (u.start < 0) {
+          return 0;
+        } else if (u.stop == 0) {
+          return 0;
+        } else {
+          r += u.stop - u.start;
+        }
+      }
     }
+    return r;
   }
 };
 
@@ -64,6 +76,7 @@ struct ProgramOptions {
   int bootstrap;
   std::vector<std::string> transfasta;
   bool batch_mode;
+  bool pseudo_read_files_supplied;
   bool bus_mode;
   BUSOptions busOptions;
   bool pseudo_quant;
@@ -87,6 +100,8 @@ struct ProgramOptions {
   enum class StrandType {None, FR, RF};
   StrandType strand;
   bool umi;
+  bool batch_bus_write;
+  bool batch_bus;
   std::string gfa; // used for inspect
   bool inspect_thorough;
   bool single_overhang;
@@ -94,6 +109,12 @@ struct ProgramOptions {
   std::string chromFile;
   std::string bedFile;
   std::string technology;
+  std::string tagsequence;
+  std::string tccFile;
+  std::string ecFile;
+  std::string fldFile;
+  std::string transcriptsFile;
+  std::string genemap;
 
 ProgramOptions() :
   verbose(false),
@@ -107,6 +128,7 @@ ProgramOptions() :
   min_range(1),
   bootstrap(0),
   batch_mode(false),
+  pseudo_read_files_supplied(false),
   bus_mode(false),
   pseudo_quant(false),
   bam(false),
@@ -123,6 +145,8 @@ ProgramOptions() :
   fusion(false),
   strand(StrandType::None),
   umi(false),
+  batch_bus_write(false),
+  batch_bus(false),
   inspect_thorough(false),
   single_overhang(false)
   {}


=====================================
src/main.cpp
=====================================
@@ -8,6 +8,7 @@
 #include <thread>
 #include <time.h>
 #include <algorithm>
+#include <limits>
 
 #include <cstdio>
 
@@ -379,23 +380,118 @@ void ParseOptionsEMOnly(int argc, char **argv, ProgramOptions& opt) {
   }
 }
 
+void ParseOptionsTCCQuant(int argc, char **argv, ProgramOptions& opt) {
+  const char *opt_string = "o:i:T:e:f:l:s:t:g:G:b:d:";
+  static struct option long_options[] = {
+    {"index", required_argument, 0, 'i'},
+    {"txnames", required_argument, 0, 'T'},
+    {"threads", required_argument, 0, 't'},
+    {"fragment-file", required_argument, 0, 'f'},
+    {"fragment-length", required_argument, 0, 'l'},
+    {"sd", required_argument, 0, 's'},
+    {"output-dir", required_argument, 0, 'o'},
+    {"ec-file", required_argument, 0, 'e'},
+    {"genemap", required_argument, 0, 'g'},
+    {"gtf", required_argument, 0, 'G'},
+    {"bootstrap-samples", required_argument, 0, 'b'},
+    {"seed", required_argument, 0, 'd'},
+    {0,0,0,0}
+  };
+  int c;
+  int option_index = 0;
+  while (true) {
+    c = getopt_long(argc,argv,opt_string, long_options, &option_index);
+    
+    if (c == -1) {
+      break;
+    }
+    
+    switch (c) {
+    case 0:
+      break;
+    case 't': {
+      stringstream(optarg) >> opt.threads;
+      break;
+    }
+    case 'f': {
+      stringstream(optarg) >> opt.fldFile;
+      break;
+    }
+    case 'l': {
+      stringstream(optarg) >> opt.fld;
+      break;
+    }
+    case 's': {
+      stringstream(optarg) >> opt.sd;
+      break;
+    }
+    case 'o': {
+      opt.output = optarg;
+      break;
+    }
+    case 'i': {
+      opt.index = optarg;
+      break;
+    }
+    case 'e': {
+      opt.ecFile = optarg;
+      break;
+    }
+    case 'g': {
+      opt.genemap = optarg;
+      break;
+    }
+    case 'G': {
+      opt.gtfFile = optarg;
+      break;
+    }
+    case 'b': {
+      stringstream(optarg) >> opt.bootstrap;
+      break;
+    }
+    case 'd': {
+      stringstream(optarg) >> opt.seed;
+      break;
+    }
+    case 'T': {
+      opt.transcriptsFile = optarg;
+      break;
+    }
+    default: break;
+    }
+  }
+  for (int i = optind; i < argc; i++) {
+    opt.files.push_back(argv[i]);
+  }
+  if (opt.files.size() > 0) {
+    opt.tccFile = opt.files[0];
+  }
+}
+
 void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
   int verbose_flag = 0;
   int single_flag = 0;
   int strand_flag = 0;
+  int strand_FR_flag = 0;
+  int strand_RF_flag = 0;
   int pbam_flag = 0;
   int gbam_flag = 0;
   int umi_flag = 0;
   int quant_flag = 0;
+  int bus_flag = 0;
 
-  const char *opt_string = "t:i:l:s:o:b:u:g:";
+  const char *opt_string = "t:i:l:s:o:b:u:g:n";
   static struct option long_options[] = {
     // long args
     {"verbose", no_argument, &verbose_flag, 1},
     {"single", no_argument, &single_flag, 1},
     //{"strand-specific", no_argument, &strand_flag, 1},
+    {"fr-stranded", no_argument, &strand_FR_flag, 1},
+    {"rf-stranded", no_argument, &strand_RF_flag, 1},
     {"pseudobam", no_argument, &pbam_flag, 1},
     {"quant", no_argument, &quant_flag, 1},
+    {"bus", no_argument, &bus_flag, 1},
+    {"num", no_argument, 0, 'n'},
     {"umi", no_argument, &umi_flag, 'u'},
     {"batch", required_argument, 0, 'b'},
     // short args
@@ -435,6 +531,10 @@ void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
       stringstream(optarg) >> opt.sd;
       break;
     }
+    case 'n': {
+      opt.num = true;
+      break;
+    }
     case 'o': {
       opt.output = optarg;
       break;
@@ -456,6 +556,14 @@ void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
     opt.umi = true;
     opt.single_end = true; // UMI implies single-end reads
   }
+  
+  if (bus_flag) {
+    if (!opt.num) {
+      opt.batch_bus_write = true;
+    } else {
+      opt.batch_bus = true;
+    }
+  }
 
   // all other arguments are fast[a/q] files to be read
   for (int i = optind; i < argc; i++) {
@@ -478,6 +586,16 @@ void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
   if (strand_flag) {
     opt.strand_specific = true;
   }
+  
+  if (strand_FR_flag) {
+    opt.strand_specific = true;
+    opt.strand = ProgramOptions::StrandType::FR;
+  }
+  
+  if (strand_RF_flag) {
+    opt.strand_specific = true;
+    opt.strand = ProgramOptions::StrandType::RF;
+  }
 
   if (pbam_flag) {
     opt.pseudobam = true;
@@ -532,6 +650,8 @@ void ListSingleCellTechnologies() {
   << "10xv1            10x version 1 chemistry" << endl
   << "10xv2            10x version 2 chemistry" << endl
   << "10xv3            10x version 3 chemistry" << endl
+  << "Bulk             Bulk RNA-seq or Smart-seq2 (multiplexed)" << endl
+  << "BDWTA            BD Rhapsody WTA" << endl
   << "CELSeq           CEL-Seq" << endl
   << "CELSeq2          CEL-Seq version 2" << endl
   << "DropSeq          DropSeq" << endl
@@ -539,27 +659,40 @@ void ListSingleCellTechnologies() {
   << "inDropsv2        inDrops version 2 chemistry" << endl
   << "inDropsv3        inDrops version 3 chemistry" << endl
   << "SCRBSeq          SCRB-Seq" << endl
+  << "SmartSeq3        Smart-seq3" << endl
+  << "SPLiT-seq        SPLiT-seq" << endl
   << "SureCell         SureCell for ddSEQ" << endl
+  << "Visium           10x Visium Spatial Transcriptomics" << endl
   << endl;
  }
 
 void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
   int verbose_flag = 0;
   int gbam_flag = 0;
+  int paired_end_flag = 0;
+  int strand_FR_flag = 0;
+  int strand_RF_flag = 0;
+  int unstranded_flag = 0;
 
-  const char *opt_string = "i:o:x:t:lbng:c:";
+  const char *opt_string = "i:o:x:t:lbng:c:T:B:";
   static struct option long_options[] = {
     {"verbose", no_argument, &verbose_flag, 1},
     {"index", required_argument, 0, 'i'},
     {"output-dir", required_argument, 0, 'o'},
     {"technology", required_argument, 0, 'x'},
     {"list", no_argument, 0, 'l'},
+    {"batch", required_argument, 0, 'B'},
     {"threads", required_argument, 0, 't'},
     {"bam", no_argument, 0, 'b'},
     {"num", no_argument, 0, 'n'},
     {"genomebam", no_argument, &gbam_flag, 1},
     {"gtf", required_argument, 0, 'g'},
     {"chromosomes", required_argument, 0, 'c'},
+    {"tag", required_argument, 0, 'T'},
+    {"fr-stranded", no_argument, &strand_FR_flag, 1},
+    {"rf-stranded", no_argument, &strand_RF_flag, 1},
+    {"unstranded", no_argument, &unstranded_flag, 1},
+    {"paired", no_argument, &paired_end_flag, 1},
     {0,0,0,0}
   };
 
@@ -601,6 +734,11 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
       opt.bam = true;
       break;
     }
+    case 'B': {
+      opt.batch_mode = true;
+      opt.batch_file_name = optarg;
+      break;
+    }
     case 'n': {
       opt.num = true;
       break;
@@ -613,6 +751,10 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
       stringstream(optarg) >> opt.chromFile;
       break;
     }
+    case 'T': {
+      stringstream(optarg) >> opt.tagsequence;
+      break;
+    }
     default: break;
     }
   }
@@ -631,6 +773,26 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
     opt.genomebam = true;    
   }
 
+  if (strand_FR_flag) {
+    opt.strand_specific = true;
+    opt.strand = ProgramOptions::StrandType::FR;
+  }
+
+  if (strand_RF_flag) {
+    opt.strand_specific = true;
+    opt.strand = ProgramOptions::StrandType::RF;
+  }
+  
+  if (unstranded_flag) {
+    opt.strand_specific = true;
+    opt.strand = ProgramOptions::StrandType::None;
+  }
+
+  if (paired_end_flag) {
+    opt.single_end = false;
+  } else {
+    opt.single_end = true;
+  }
   
   // all other arguments are fast[a/q] files to be read
   for (int i = optind; i < argc; i++) {
@@ -726,11 +888,7 @@ bool ParseTechnology(const std::string &techstr, BUSOptions& busopt, std::vector
     errorList.push_back("Error: empty UMI list " + umistr);
     return false;
   }
-  if (v.size() != 1) {
-    errorList.push_back("Error: only a single UMI list allow " + umistr);
-    return false;
-  }
-  busopt.umi = std::move(v.front());
+  busopt.umi = std::move(v);
 
 
   if (!convert_commas_to_vector(seqstr, v)) {
@@ -808,7 +966,7 @@ bool CheckOptionsBus(ProgramOptions& opt) {
   }
 
   // check files
-  if (opt.files.size() == 0) {
+  if (opt.files.size() == 0 && !opt.batch_mode) {
     cerr << ERROR_STR << " Missing read files" << endl;
     ret = false;
   } else {
@@ -853,67 +1011,264 @@ bool CheckOptionsBus(ProgramOptions& opt) {
            << ", but only " << n << " cores on the machine" << endl;
     }    
   }
-
-  if (opt.technology.empty()) {
-    cerr << "Error: need to specify technology to use" << endl;
+  if (!opt.technology.empty() && opt.batch_mode) {
+    cerr << "Error: cannot specify both -x and --batch" << endl;
     ret = false;
+  }
+
+  ProgramOptions::StrandType strand = ProgramOptions::StrandType::None;
+
+  if (opt.technology.empty()) { // kallisto pseudo
+    if (!opt.num) {
+      opt.batch_bus_write = true;
+    } else {
+      opt.batch_bus = true;
+    }
+    // check for read files
+    if (!opt.batch_mode) {
+      cerr << "[bus] no technology specified; will try running read files as-is" << endl;
+      opt.batch_mode = true;
+      if (!opt.single_end && opt.files.size() % 2 != 0) {
+        cerr << "Error: paired-end mode requires an even number of input files" << endl;
+        ret = false;
+      } else {
+        int i = 0;
+        int sample_id = 0;
+        while (i < opt.files.size()) {
+          opt.batch_ids.push_back("batch" + std::to_string(sample_id));
+          std::string f1,f2;
+          struct stat stFileInfo;
+          f1 = opt.files[i];
+          if (opt.single_end) {
+            opt.batch_files.push_back({f1});
+            auto intstat = stat(f1.c_str(), &stFileInfo);
+            if (intstat != 0) {
+              cerr << ERROR_STR << " file not found " << f1 << endl;
+              ret = false;
+            }
+          } else {
+            i++;
+            f2 = opt.files[i];
+            opt.batch_files.push_back({f1,f2});
+            auto intstat = stat(f1.c_str(), &stFileInfo);
+            if (intstat != 0) {
+              cerr << ERROR_STR << " file not found " << f1 << endl;
+              ret = false;
+            }
+            intstat = stat(f2.c_str(), &stFileInfo);
+            if (intstat != 0) {
+              cerr << ERROR_STR << " file not found " << f2 << endl;
+              ret = false;
+            }
+          }
+          i++;
+          sample_id++;
+        }
+        if (sample_id == 1) {
+          opt.pseudo_read_files_supplied = true; // Only one sample supplied; use SR instead of batchSR to read
+        }
+      }
+    } else if (ret) {
+      cerr << "[bus] no technology specified; will try running read files supplied in batch file" << endl;
+      if (!opt.single_end) {
+        cerr << "[bus] --paired ignored; single/paired-end is inferred from number of files supplied" << endl;
+      }
+      if (opt.files.size() != 0) {
+        cerr << ERROR_STR << " cannot specify batch mode and supply read files" << endl;
+        ret = false;
+      } else {
+        // check for batch files
+        if (opt.batch_mode) {
+          struct stat stFileInfo;
+          auto intstat = stat(opt.batch_file_name.c_str(), &stFileInfo);
+          if (intstat != 0) {
+            cerr << ERROR_STR << " file not found " << opt.batch_file_name << endl;
+            ret = false;
+          }
+          // open the file, parse and fill the batch_files values
+          std::ifstream bfile(opt.batch_file_name);
+          std::string line;
+          std::string id,f1,f2;
+          bool read_first_batch_file_line = false;
+          while (std::getline(bfile,line)) {
+            if (line.size() == 0) {
+              continue;
+            }
+            std::stringstream ss(line);
+            ss >> id;
+            if (id[0] == '#') {
+              continue;
+            }
+            opt.batch_ids.push_back(id);
+            ss >> f1 >> f2;
+            if (!read_first_batch_file_line) {
+              if (f2.empty()) {
+                opt.single_end = true;
+              } else {
+                opt.single_end = false;
+              }
+              read_first_batch_file_line = true;
+            }
+            if (opt.single_end) {
+              opt.batch_files.push_back({f1});
+              intstat = stat(f1.c_str(), &stFileInfo);
+              if (intstat != 0) {
+                cerr << ERROR_STR << " file not found " << f1 << endl;
+                ret = false;
+              }
+              if (!f2.empty()) {
+                cerr << ERROR_STR << " batch file malformatted" << endl;
+                ret = false;
+                break;
+              }
+            } else {
+              opt.batch_files.push_back({f1,f2});
+              intstat = stat(f1.c_str(), &stFileInfo);
+              if (intstat != 0) {
+                cerr << ERROR_STR << " file not found " << f1 << endl;
+                ret = false;
+              }
+              if (f2.empty()) {
+                cerr << ERROR_STR << " batch file malformatted" << endl;
+                ret = false;
+                break;
+              }
+              intstat = stat(f2.c_str(), &stFileInfo);
+              if (intstat != 0) {
+                cerr << ERROR_STR << " file not found " << f2 << endl;
+                ret = false;
+              }
+            }
+            f1.clear();
+            f2.clear();
+          }
+        }
+      }
+    }
+    if (opt.pseudobam) {
+      cerr << "Error: Pseudobam not supported yet in this mode" << endl;
+      ret = false;
+    }
+    if (opt.bam) {
+      cerr << "Error: --bam not supported in this mode" << endl;
+      ret = false;
+    }
+    if (!opt.tagsequence.empty()) {
+      cerr << "Error: --tag not supported in this mode" << endl;
+      ret = false;
+    }
+    if (opt.strand_specific && opt.strand == ProgramOptions::StrandType::None) {
+      opt.strand_specific = false;
+    }
+    if (ret) { // Set up BUS options
+      auto& busopt = opt.busOptions;
+      busopt.seq.push_back(BUSOptionSubstr(0,0,0));
+      busopt.umi.resize(0);
+      busopt.bc.resize(0);
+      if (opt.single_end) {
+        busopt.nfiles = 1;
+        busopt.paired = false;
+      } else {
+        busopt.nfiles = 2;
+        busopt.seq.push_back(BUSOptionSubstr(1,0,0));
+        busopt.paired = true;
+      }
+      busopt.umi.push_back(BUSOptionSubstr(-1,-1,-1));
+    }
+    return ret;
   } else {
     auto& busopt = opt.busOptions;
    
     if (opt.bam) { // Note: only 10xV2 has been tested
       busopt.nfiles = 1;
+      busopt.paired = false;
       if (opt.technology == "10XV2") {
         busopt.seq.push_back(BUSOptionSubstr(1,0,0)); // second file, entire string
-        busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
+        busopt.umi.push_back(BUSOptionSubstr(0,16,26)); // first file [16:26]
         busopt.bc.push_back(BUSOptionSubstr(0,0,16));
       } else if (opt.technology == "10XV3") {
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,16,28);
+        busopt.umi.push_back(BUSOptionSubstr(0,16,28));
         busopt.bc.push_back(BUSOptionSubstr(0,0,16));
 //      } else if (opt.technology == "10XV1") {
 
       } else if (opt.technology == "SURECELL") {
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,18,26);
+        busopt.umi.push_back(BUSOptionSubstr(0,18,26));
         busopt.bc.push_back(BUSOptionSubstr(0,0,18));
       } else if (opt.technology == "DROPSEQ") {
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,12,20);
+        busopt.umi.push_back(BUSOptionSubstr(0,12,20));
         busopt.bc.push_back(BUSOptionSubstr(0,0,12));
       } else if (opt.technology == "INDROPSV1") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,42,48);
+        busopt.umi.push_back(BUSOptionSubstr(0,42,48));
         busopt.bc.push_back(BUSOptionSubstr(0,0,11));
         busopt.bc.push_back(BUSOptionSubstr(0,30,38));  
       } else if (opt.technology == "INDROPSV2") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(0,0,0));
-        busopt.umi = BUSOptionSubstr(1,42,48);
+        busopt.umi.push_back(BUSOptionSubstr(1,42,48));
         busopt.bc.push_back(BUSOptionSubstr(1,0,11));
         busopt.bc.push_back(BUSOptionSubstr(1,30,38));  
       } else if (opt.technology == "INDROPSV3") {
         busopt.nfiles = 3;
         busopt.seq.push_back(BUSOptionSubstr(2,0,0));
-        busopt.umi = BUSOptionSubstr(1,8,14);
+        busopt.umi.push_back(BUSOptionSubstr(1,8,14));
         busopt.bc.push_back(BUSOptionSubstr(0,0,8));
         busopt.bc.push_back(BUSOptionSubstr(1,0,8));
       } else if (opt.technology == "CELSEQ") {
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,8,12);
+        busopt.umi.push_back(BUSOptionSubstr(0,8,12));
         busopt.bc.push_back(BUSOptionSubstr(0,0,8));
       } else if (opt.technology == "CELSEQ2") {
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,0,6);
+        busopt.umi.push_back(BUSOptionSubstr(0,0,6));
         busopt.bc.push_back(BUSOptionSubstr(0,6,12));
+      } else if (opt.technology == "SPLIT-SEQ") {
+        busopt.nfiles = 2;
+        busopt.seq.push_back(BUSOptionSubstr(0,0,0));
+        busopt.umi.push_back(BUSOptionSubstr(1,0,10));
+        busopt.bc.push_back(BUSOptionSubstr(1,10,18));
+        busopt.bc.push_back(BUSOptionSubstr(1,48,56));
+        busopt.bc.push_back(BUSOptionSubstr(1,78,86));
       } else if (opt.technology == "SCRBSEQ") {
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,6,16);
+        busopt.umi.push_back(BUSOptionSubstr(0,6,16));
         busopt.bc.push_back(BUSOptionSubstr(0,0,6));
       } else if (opt.technology == "INDROPSV3") {
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,0,6);
+        busopt.umi.push_back(BUSOptionSubstr(0,0,6));
         busopt.bc.push_back(BUSOptionSubstr(0,6,16));
+      } else if (opt.technology == "SMARTSEQ3") {
+        busopt.nfiles = 4;
+        busopt.seq.push_back(BUSOptionSubstr(2,22,0));
+        busopt.seq.push_back(BUSOptionSubstr(3,0,0));
+        busopt.umi.push_back(BUSOptionSubstr(2,0,19));
+        busopt.bc.push_back(BUSOptionSubstr(0,0,0));
+        busopt.bc.push_back(BUSOptionSubstr(1,0,0));
+        busopt.paired = true;
+      } else if (opt.technology == "BULK") {
+        busopt.nfiles = 3;
+        busopt.seq.push_back(BUSOptionSubstr(2,0,0));
+        busopt.umi.push_back(BUSOptionSubstr(-1,-1,-1));
+        busopt.bc.push_back(BUSOptionSubstr(0,0,0));
+        busopt.bc.push_back(BUSOptionSubstr(1,0,0));
+        if (!opt.single_end) {
+          busopt.nfiles++;
+          busopt.seq.push_back(BUSOptionSubstr(3,0,0));
+          busopt.paired = true;
+        }
+      } else if (opt.technology == "BDWTA") {
+        busopt.nfiles = 2;
+        busopt.bc.push_back(BUSOptionSubstr(0, 0, 9)); // bc1 CLS1
+        // busopt.bc.push_back(BUSOptionSubstr(0,9,9+12)); // linker
+        busopt.bc.push_back(BUSOptionSubstr(0, 9 + 12, 9 + 12 + 9)); // bc2 CLS2
+        // busopt.bc.push_back(BUSOptionSubstr(0,9+12+9,9+12+9+13)); // linker
+        busopt.bc.push_back(BUSOptionSubstr(0, 9 + 12 + 9 + 13, 9 + 12 + 9 + 13 + 9));          // bc3 CLS3
+        busopt.umi.push_back(BUSOptionSubstr(0, 9 + 12 + 9 + 13 + 9, 9 + 12 + 9 + 13 + 9 + 8)); // umi
+        busopt.seq.push_back(BUSOptionSubstr(1, 0, 0));
       } else {
         vector<int> files;
         vector<BUSOptionSubstr> values;
@@ -922,13 +1277,17 @@ bool CheckOptionsBus(ProgramOptions& opt) {
         //bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
         bool valid = ParseTechnology(opt.technology, busopt, errorList);
         
+        if (busopt.seq.size() == 2 && !opt.single_end) {
+          busopt.paired = true;
+        }
+        
         if(!valid) {
           /*
           busopt.nfiles = files.size(); 
           for(int i = 0; i < bcValues.size(); i++) {
             busopt.bc.push_back(bcValues[i]);
           }
-          busopt.umi = values[0];
+          busopt.umi.push_back(values[0]);
           busopt.seq.push_back(values[1]);
           */
         } else {
@@ -940,72 +1299,116 @@ bool CheckOptionsBus(ProgramOptions& opt) {
         }
       }
     } else {
+      busopt.paired = false;
       if (opt.technology == "10XV2") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0)); // second file, entire string
-        busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
+        busopt.umi.push_back(BUSOptionSubstr(0,16,26)); // first file [16:26]
         busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+        strand = ProgramOptions::StrandType::FR;
       } else if (opt.technology == "10XV3") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,16,28);
+        busopt.umi.push_back(BUSOptionSubstr(0,16,28));
         busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+        strand = ProgramOptions::StrandType::FR;
+      } else if (opt.technology == "VISIUM") {
+        busopt.nfiles = 2;
+        busopt.seq.push_back(BUSOptionSubstr(1,0,0));
+        busopt.umi.push_back(BUSOptionSubstr(0,16,28));
+        busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+        strand = ProgramOptions::StrandType::FR;
       } else if (opt.technology == "10XV1") {
         busopt.nfiles = 3;
         busopt.seq.push_back(BUSOptionSubstr(2,0,0));
-        busopt.umi = BUSOptionSubstr(1,0,10);
+        busopt.umi.push_back(BUSOptionSubstr(1,0,10));
         busopt.bc.push_back(BUSOptionSubstr(0,0,14));
+        strand = ProgramOptions::StrandType::FR;
       } else if (opt.technology == "SURECELL") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,51,59);
+        busopt.umi.push_back(BUSOptionSubstr(0,51,59));
         busopt.bc.push_back(BUSOptionSubstr(0,0,6));
         busopt.bc.push_back(BUSOptionSubstr(0,21,27));
         busopt.bc.push_back(BUSOptionSubstr(0,42,48));
+        strand = ProgramOptions::StrandType::FR;
       } else if (opt.technology == "DROPSEQ") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,12,20);
+        busopt.umi.push_back(BUSOptionSubstr(0,12,20));
         busopt.bc.push_back(BUSOptionSubstr(0,0,12));
       } else if (opt.technology == "INDROPSV1") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,42,48);
+        busopt.umi.push_back(BUSOptionSubstr(0,42,48));
         busopt.bc.push_back(BUSOptionSubstr(0,0,11));
-        busopt.bc.push_back(BUSOptionSubstr(0,30,38));  
+        busopt.bc.push_back(BUSOptionSubstr(0,30,38)); 
       } else if (opt.technology == "INDROPSV2") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(0,0,0));
-        busopt.umi = BUSOptionSubstr(1,42,48);
+        busopt.umi.push_back(BUSOptionSubstr(1,42,48));
         busopt.bc.push_back(BUSOptionSubstr(1,0,11));
-        busopt.bc.push_back(BUSOptionSubstr(1,30,38));  
+        busopt.bc.push_back(BUSOptionSubstr(1,30,38)); 
       } else if (opt.technology == "INDROPSV3") {
         busopt.nfiles = 3;
         busopt.seq.push_back(BUSOptionSubstr(2,0,0));
-        busopt.umi = BUSOptionSubstr(1,8,14);
+        busopt.umi.push_back(BUSOptionSubstr(1,8,14));
         busopt.bc.push_back(BUSOptionSubstr(0,0,8));
         busopt.bc.push_back(BUSOptionSubstr(1,0,8));
       } else if (opt.technology == "CELSEQ") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,8,12);
+        busopt.umi.push_back(BUSOptionSubstr(0,8,12));
         busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+        strand = ProgramOptions::StrandType::FR;
       } else if (opt.technology == "CELSEQ2") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,0,6);
+        busopt.umi.push_back(BUSOptionSubstr(0,0,6));
         busopt.bc.push_back(BUSOptionSubstr(0,6,12));
+        strand = ProgramOptions::StrandType::FR;
+      } else if (opt.technology == "SPLIT-SEQ"){
+        busopt.nfiles = 2;
+        busopt.seq.push_back(BUSOptionSubstr(0,0,0));
+        busopt.umi.push_back(BUSOptionSubstr(1,0,10));
+        busopt.bc.push_back(BUSOptionSubstr(1,10,18));
+        busopt.bc.push_back(BUSOptionSubstr(1,48,56));
+        busopt.bc.push_back(BUSOptionSubstr(1,78,86));
       } else if (opt.technology == "SCRBSEQ") {
         busopt.nfiles = 2;
         busopt.seq.push_back(BUSOptionSubstr(1,0,0));
-        busopt.umi = BUSOptionSubstr(0,6,16);
+        busopt.umi.push_back(BUSOptionSubstr(0,6,16));
         busopt.bc.push_back(BUSOptionSubstr(0,0,6));
-      } else if (opt.technology == "INDROPSV3") {
+      } else if (opt.technology == "SMARTSEQ3") {
+        busopt.nfiles = 4;
+        busopt.seq.push_back(BUSOptionSubstr(2,22,0));
+        busopt.seq.push_back(BUSOptionSubstr(3,0,0));
+        busopt.umi.push_back(BUSOptionSubstr(2,0,19));
+        busopt.bc.push_back(BUSOptionSubstr(0,0,0));
+        busopt.bc.push_back(BUSOptionSubstr(1,0,0));
+        busopt.paired = true;
+        strand = ProgramOptions::StrandType::FR;
+      } else if (opt.technology == "BULK") {
         busopt.nfiles = 3;
         busopt.seq.push_back(BUSOptionSubstr(2,0,0));
-        busopt.umi = BUSOptionSubstr(1,8,14);
-        busopt.bc.push_back(BUSOptionSubstr(0,0,8));
-        busopt.bc.push_back(BUSOptionSubstr(1,0,8));
+        busopt.umi.push_back(BUSOptionSubstr(-1,-1,-1));
+        busopt.bc.push_back(BUSOptionSubstr(0,0,0));
+        busopt.bc.push_back(BUSOptionSubstr(1,0,0));
+        if (!opt.single_end) {
+          busopt.nfiles++;
+          busopt.seq.push_back(BUSOptionSubstr(3,0,0));
+          busopt.paired = true;
+        }
+      } else if (opt.technology == "BDWTA") {
+        busopt.nfiles = 2;
+        busopt.bc.push_back(BUSOptionSubstr(0, 0, 9)); // bc1 CLS1
+        // busopt.bc.push_back(BUSOptionSubstr(0,9,9+12)); // linker
+        busopt.bc.push_back(BUSOptionSubstr(0, 9 + 12, 9 + 12 + 9)); // bc2 CLS2
+        // busopt.bc.push_back(BUSOptionSubstr(0,9+12+9,9+12+9+13)); // linker
+        busopt.bc.push_back(BUSOptionSubstr(0, 9 + 12 + 9 + 13, 9 + 12 + 9 + 13 + 9));          // bc3 CLS3
+        busopt.umi.push_back(BUSOptionSubstr(0, 9 + 12 + 9 + 13 + 9, 9 + 12 + 9 + 13 + 9 + 8)); // umi
+        busopt.seq.push_back(BUSOptionSubstr(1, 0, 0));
+        strand = ProgramOptions::StrandType::FR;
       } else {
         vector<int> files;
         vector<BUSOptionSubstr> values;
@@ -1014,6 +1417,9 @@ bool CheckOptionsBus(ProgramOptions& opt) {
         //bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
         bool valid = ParseTechnology(opt.technology, busopt, errorList);
         
+        if (busopt.seq.size() == 2 && !opt.single_end) {
+          busopt.paired = true;
+        }
         
         if(valid) {
           /*
@@ -1021,7 +1427,7 @@ bool CheckOptionsBus(ProgramOptions& opt) {
           for(int i = 0; i < bcValues.size(); i++) {
             busopt.bc.push_back(bcValues[i]);
           }
-          busopt.umi = values[0];
+          busopt.umi.push_back(values[0]);
           busopt.seq.push_back(values[1]);
           */
         } else {
@@ -1034,10 +1440,50 @@ bool CheckOptionsBus(ProgramOptions& opt) {
       }
     }
   }
+  
+  if (opt.tagsequence.empty() && (opt.technology == "SMARTSEQ3")) {
+    opt.tagsequence = "ATTGCGCAATG";
+    cerr << "[bus] Using " <<  opt.tagsequence << " as UMI tag sequence" << endl;
+  }
+  
+  if (opt.strand_specific && opt.strand == ProgramOptions::StrandType::None) {
+    opt.strand_specific = false; // User specified --unstranded
+  } else if (!opt.strand_specific && ret) {
+    opt.strand = strand;
+    if (opt.strand == ProgramOptions::StrandType::FR) {
+      cerr << "[bus] Note: Strand option was not specified; setting it to --fr-stranded for specified technology" << endl;
+      opt.strand_specific = true;
+    } else if (opt.strand == ProgramOptions::StrandType::RF) {
+      cerr << "[bus] Note: Strand option was not specified; setting it to --rf-stranded for specified technology" << endl;
+      opt.strand_specific = true;
+    } else {
+      cerr << "[bus] Note: Strand option was not specified; setting it to --unstranded for specified technology" << endl;
+    }
+  }
+  
+  if (!opt.tagsequence.empty()) {
+    opt.busOptions.umi[0].start += opt.tagsequence.length();
+    if (opt.busOptions.umi[0].start >= opt.busOptions.umi[0].stop) {
+      cerr << "Error: Tag sequence must be shorter than UMI sequence" << endl;
+      ret = false;
+    }
+  }
+  
+  if (!opt.single_end && !opt.busOptions.paired) {
+      cerr << "Error: Paired reads are not compatible with the specified technology" << endl;
+      ret = false;
+  }
+
+  if (opt.strand_specific) {
+    if (opt.busOptions.seq.size() != 1 && !(opt.busOptions.seq.size() == 2 && opt.busOptions.paired)) {
+      cerr << "Error: Strand-specific read processing is only supported for technologies with a single CDNA read file or paired-end reads" << endl;
+      ret = false;
+    }
+  }
 
   if (opt.genomebam) {
-    if (opt.busOptions.seq.size() != 1) {
-      cerr << "Error: BAM output is currently only supported for technologies with a single CDNA read file" << endl;
+    if (opt.busOptions.seq.size() != 1 && !(opt.busOptions.seq.size() == 2 && opt.busOptions.paired)) {
+      cerr << "Error: BAM output is currently only supported for technologies with a single CDNA read file or paired-end reads" << endl;
       ret = false;
     }
     if (!opt.gtfFile.empty()) {
@@ -1285,24 +1731,179 @@ bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) {
       //ret = false;
     }
   }
-
+
+  if (opt.bootstrap < 0) {
+    cerr << "Error: number of bootstrap samples must be a non-negative integer." << endl;
+    ret = false;
+  }
+
+  #ifndef USE_HDF5  
+  if (opt.bootstrap > 0 && !opt.plaintext) {
+    cerr << "Warning: kallisto was not compiled with HDF5 support so no bootstrapping" << endl
+         << "will be performed. Run quant with --plaintext option or recompile with" << endl
+         << "HDF5 support to obtain bootstrap estimates." << endl;
+    opt.bootstrap = 0;
+  }
+  #endif
+  return ret;
+}
+
+bool CheckOptionsTCCQuant(ProgramOptions& opt) {
+  
+  bool ret = true;
+  
+  cerr << endl;
+  // check for index
+  if (opt.index.empty() && opt.transcriptsFile.empty()) {
+    cerr << ERROR_STR << " either a kallisto index file or a transcripts file need to be supplied" << endl;
+    ret = false;
+  } else if (!opt.index.empty() && !opt.transcriptsFile.empty()) {
+    cerr << ERROR_STR << " cannot supply both a kallisto index file and a transcripts file" << endl;
+    ret = false;
+  } else if (!opt.index.empty()) {
+    struct stat stFileInfo;
+    auto intStat = stat(opt.index.c_str(), &stFileInfo);
+    if (intStat != 0) {
+      cerr << ERROR_STR << " kallisto index file not found " << opt.index << endl;
+      ret = false;
+    }
+  } else if (!opt.transcriptsFile.empty()) {
+    struct stat stFileInfo;
+    auto intStat = stat(opt.transcriptsFile.c_str(), &stFileInfo);
+    if (intStat != 0) {
+      cerr << ERROR_STR << " transcripts file not found " << opt.transcriptsFile << endl;
+      ret = false;
+    }
+  }
+
+  if (opt.tccFile.empty()) {
+    cerr << ERROR_STR << " transcript-compatibility counts file missing" << endl;
+    ret = false;
+  } else {
+    struct stat stFileInfo;
+    auto intStat = stat(opt.tccFile.c_str(), &stFileInfo);
+    if (intStat != 0) {
+      cerr << ERROR_STR << " transcript-compatibility counts file not found " << opt.tccFile << endl;
+      ret = false;
+    }
+  }
+  
+  if (opt.ecFile.empty() && !opt.transcriptsFile.empty()) {
+    cerr << ERROR_STR << " equivalence class file must be supplied if transcripts file is supplied " << opt.tccFile << endl;
+    ret = false;
+  }
+
+  if (!opt.ecFile.empty()) {
+    struct stat stFileInfo;
+    auto intStat = stat(opt.ecFile.c_str(), &stFileInfo);
+    if (intStat != 0) {
+      cerr << ERROR_STR << " equivalence class file not found " << opt.ecFile << endl;
+      ret = false;
+    }
+  }
+  
+  if (!opt.fldFile.empty()) {
+    struct stat stFileInfo;
+    auto intStat = stat(opt.fldFile.c_str(), &stFileInfo);
+    if (intStat != 0) {
+      cerr << ERROR_STR << " fragment length distribution file not found " << opt.fldFile << endl;
+      ret = false;
+    }
+  }
+  
+  if (!opt.genemap.empty() && !opt.gtfFile.empty()) {
+    cerr << ERROR_STR << " Cannot supply both --genemap and --gtf" << endl;
+    ret = false;
+  }
+  
+  if (!opt.gtfFile.empty()) {
+    struct stat stFileInfo;
+    auto intStat = stat(opt.gtfFile.c_str(), &stFileInfo);
+    if (intStat != 0) {
+      cerr << ERROR_STR << " GTF file not found " << opt.gtfFile << endl;
+      ret = false;
+    }
+  }
+  
+  if (!opt.genemap.empty()) {
+    struct stat stFileInfo;
+    auto intStat = stat(opt.genemap.c_str(), &stFileInfo);
+    if (intStat != 0) {
+      cerr << ERROR_STR << " file for mapping transcripts to genes not found " << opt.genemap << endl;
+      ret = false;
+    }
+  }
+  
+  if ((opt.fld != 0.0 || opt.sd != 0.0) && !opt.fldFile.empty()) {
+    cerr << ERROR_STR <<" cannot supply mean or sd while also supplying a fragment length distribution file" << endl;
+    ret = false;
+  }
+  
+  if ((opt.fld != 0.0 && opt.sd == 0.0) || (opt.sd != 0.0 && opt.fld == 0.0)) {
+    cerr << ERROR_STR << " cannot supply mean/sd without supplying both -l and -s" << endl;
+    ret = false;
+  }
+  
+  if (opt.index.empty() && (!opt.fldFile.empty() || opt.fld != 0.0 || opt.sd != 0.0)) {
+    cerr << ERROR_STR << " cannot supply fragment length information unless a kallisto index is provided" << endl;
+    ret = false;
+  }
+  
+  if (ret && opt.fld > 0.0 && opt.sd > 0.0) {
+    cerr << "[tcc] fragment length distribution is truncated gaussian with mean = " <<
+      opt.fld << ", sd = " << opt.sd << endl;
+  }
+  
+  if (opt.fld < 0.0) {
+    cerr << ERROR_STR << " invalid value for mean fragment length " << opt.fld << endl;
+    ret = false;
+  }
+  
+  if (opt.sd < 0.0) {
+    cerr << ERROR_STR << " invalid value for fragment length standard deviation " << opt.sd << endl;
+    ret = false;
+  }
+
+  if (opt.output.empty()) {
+    cerr << ERROR_STR << " need to specify output directory " << opt.output << endl;
+    ret = false;
+  } else {
+    struct stat stFileInfo;
+    auto intStat = stat(opt.output.c_str(), &stFileInfo);
+    if (intStat == 0) {
+      // file/dir exits
+      if (!S_ISDIR(stFileInfo.st_mode)) {
+        cerr << ERROR_STR << " file " << opt.output << " exists and is not a directory" << endl;
+        ret = false;
+      }
+    } else {
+      // create directory
+      if (my_mkdir(opt.output.c_str(), 0777) == -1) {
+        cerr << ERROR_STR << " could not create directory " << opt.output << endl;
+        ret = false;
+      }
+    }
+  }
+
+  if (opt.threads <= 0) {
+    cerr << ERROR_STR << " invalid number of threads " << opt.threads << endl;
+    ret = false;
+  } else {
+    unsigned int n = std::thread::hardware_concurrency();
+    if (n != 0 && n < opt.threads) {
+      cerr << "Warning: you asked for " << opt.threads
+           << ", but only " << n << " cores on the machine" << endl;
+    }
+  }
+  
   if (opt.bootstrap < 0) {
     cerr << "Error: number of bootstrap samples must be a non-negative integer." << endl;
     ret = false;
   }
 
-  #ifndef USE_HDF5  
-  if (opt.bootstrap > 0 && !opt.plaintext) {
-    cerr << "Warning: kallisto was not compiled with HDF5 support so no bootstrapping" << endl
-         << "will be performed. Run quant with --plaintext option or recompile with" << endl
-         << "HDF5 support to obtain bootstrap estimates." << endl;
-    opt.bootstrap = 0;
-  }
-  #endif
-  return ret;
+   return ret;
 }
 
-
 bool CheckOptionsMerge(ProgramOptions& opt) {
 
   bool ret = true;
@@ -1404,12 +2005,24 @@ bool CheckOptionsPseudo(ProgramOptions& opt) {
       ret = false;
     }
   }
+  
+  assert(!(opt.batch_bus && opt.batch_bus_write));
+  bool opt_bus_mode = opt.batch_bus || opt.batch_bus_write;
 
   if (opt.pseudo_quant) {
     if (!opt.batch_mode) {
       cerr << ERROR_STR << " --quant can only be run with in batch mode with --batch" << endl;
       ret = false;
     }
+    if (opt_bus_mode) {
+      cerr << ERROR_STR << " --quant cannot be run with --bus" << endl;
+      ret = false;
+    }
+  }
+  
+  if (opt_bus_mode && opt.umi) {
+    cerr << ERROR_STR << " UMI cannot be run with --bus" << endl;
+    ret = false;   
   }
 
   // check for read files
@@ -1418,8 +2031,45 @@ bool CheckOptionsPseudo(ProgramOptions& opt) {
       cerr << ERROR_STR << " UMI must be run in batch mode, use --batch option" << endl;
       ret = false;      
     }
-    
-    if (opt.files.size() == 0) {
+    if (opt_bus_mode && ret) {
+      cerr << "--bus specified; will try running read files in batch mode" << endl;
+      opt.batch_ids.push_back("sample");
+      opt.batch_mode = true;
+      opt.pseudo_read_files_supplied = true;
+      if (opt.files.size() != 1 && opt.files.size() != 2) {
+        cerr << ERROR_STR << " A minimum of one and a maximum of two read files must be provided" << endl;
+        ret = false;
+      } else if (opt.single_end && opt.files.size() != 1) {
+        cerr << ERROR_STR << " single-end mode requires exactly one read file" << endl;
+        ret = false;
+      } else {
+        std::string f1,f2;
+        struct stat stFileInfo;
+        if (opt.files.size() == 1) {
+          f1 = opt.files[0];
+          opt.batch_files.push_back({f1});
+          auto intstat = stat(f1.c_str(), &stFileInfo);
+          if (intstat != 0) {
+            cerr << ERROR_STR << " file not found " << f1 << endl;
+            ret = false;
+          }
+        } else {
+          f1 = opt.files[0];
+          f2 = opt.files[1];
+          opt.batch_files.push_back({f1,f2});
+          auto intstat = stat(f1.c_str(), &stFileInfo);
+          if (intstat != 0) {
+            cerr << ERROR_STR << " file not found " << f1 << endl;
+            ret = false;
+          }
+          intstat = stat(f2.c_str(), &stFileInfo);
+          if (intstat != 0) {
+            cerr << ERROR_STR << " file not found " << f2 << endl;
+            ret = false;
+          }
+        }
+      }
+    } else if (opt.files.size() == 0) {
       cerr << ERROR_STR << " Missing read files" << endl;
       ret = false;
     } else {
@@ -1511,6 +2161,12 @@ bool CheckOptionsPseudo(ProgramOptions& opt) {
     if (opt.fld != 0.0 || opt.sd != 0.0) {
       cerr << "[~warn] you supplied fragment length information for UMI data which will be ignored" << endl;
     }
+  } else if (opt_bus_mode) {
+    if (opt.fld != 0.0 || opt.sd != 0.0) {
+      cerr << "[~warn] you supplied fragment length information for --bus mode which will be ignored" << endl;
+    }
+    opt.fld = 0.0;
+    opt.sd = 0.0;
   } else {
     if ((opt.fld != 0.0 && opt.sd == 0.0) || (opt.sd != 0.0 && opt.fld == 0.0)) {
       cerr << "Error: cannot supply mean/sd without supplying both -l and -s" << endl;
@@ -1585,6 +2241,25 @@ bool CheckOptionsPseudo(ProgramOptions& opt) {
     cerr << "Error: Pseudobam not supported yet in pseudo mode, use quant mode to obtain BAM files" << endl;
     ret = false;
   }
+  
+  if (!opt_bus_mode && opt.num) {
+    cerr << "Warning: --bus option was not used, so --num option will be ignored" << endl;
+  }
+  
+  if (opt_bus_mode && ret) { // Set up BUS options
+    auto& busopt = opt.busOptions;
+    busopt.seq.push_back(BUSOptionSubstr(0,0,0));
+    busopt.umi.resize(0);
+    busopt.bc.resize(0);
+    if (opt.single_end) {
+      busopt.nfiles = 1;
+      busopt.paired = false;
+    } else {
+      busopt.nfiles = 2;
+      busopt.seq.push_back(BUSOptionSubstr(1,0,0));
+      busopt.paired = true;
+    }
+  }
 
   return ret;
 }
@@ -1701,8 +2376,8 @@ void usage() {
        << "Where <CMD> can be one of:" << endl << endl
        << "    index         Builds a kallisto index "<< endl
        << "    quant         Runs the quantification algorithm " << endl
+       << "    quant-tcc     Runs quantification on transcript-compatibility counts" << endl
        << "    bus           Generate BUS files for single-cell data " << endl
-       << "    pseudo        Runs the pseudoalignment step " << endl
        << "    merge         Merges several batch runs " << endl
        << "    h5dump        Converts HDF5-formatted results to plaintext" << endl
        << "    inspect       Inspects and gives information about an index" << endl 
@@ -1718,13 +2393,24 @@ void usageBus() {
        << "Required arguments:" << endl
        << "-i, --index=STRING            Filename for the kallisto index to be used for" << endl
        << "                              pseudoalignment" << endl
-       << "-o, --output-dir=STRING       Directory to write output to" << endl 
-       << "-x, --technology=STRING       Single-cell technology used " << endl << endl
+       << "-o, --output-dir=STRING       Directory to write output to" << endl << endl
        << "Optional arguments:" << endl
+       << "-x, --technology=STRING       Single-cell technology used " << endl
        << "-l, --list                    List all single-cell technologies supported" << endl
+       << "-B, --batch=FILE              Process files listed in FILE" << endl
        << "-t, --threads=INT             Number of threads to use (default: 1)" << endl
        << "-b, --bam                     Input file is a BAM file" << endl
        << "-n, --num                     Output number of read in flag column (incompatible with --bam)" << endl
+       << "-T, --tag=STRING              5′ tag sequence to identify UMI reads for certain technologies" << endl
+       << "    --fr-stranded             Strand specific reads for UMI-tagged reads, first read forward" << endl
+       << "    --rf-stranded             Strand specific reads for UMI-tagged reads, first read reverse" << endl
+       << "    --unstranded              Treat all read as non-strand-specific" << endl
+       << "    --paired                  Treat reads as paired" << endl
+       << "    --genomebam               Project pseudoalignments to genome sorted BAM file" << endl
+       << "-g, --gtf                     GTF file for transcriptome information" << endl
+       << "                              (required for --genomebam)" << endl
+       << "-c, --chromosomes             Tab separated file with chromosome names and lengths" << endl
+       << "                              (optional for --genomebam, but recommended)" << endl
        << "    --verbose                 Print out progress information every 1M proccessed reads" << endl;
 }
 
@@ -1800,7 +2486,7 @@ void usageEM(bool valid_input = true) {
 void usagePseudo(bool valid_input = true) {
   if (valid_input) {
     cout << "kallisto " << KALLISTO_VERSION << endl
-         << "Computes equivalence classes for reads and quantifies abundances" << endl << endl;
+         << "Computes equivalence classes for reads and quantifies abundances (deprecated)" << endl << endl;
   }
 
   cout << "Usage: kallisto pseudo [arguments] FASTQ-files" << endl << endl
@@ -1812,11 +2498,16 @@ void usagePseudo(bool valid_input = true) {
        << "-u  --umi                     First file in pair is a UMI file" << endl
        << "-b  --batch=FILE              Process files listed in FILE" << endl
        << "    --quant                   Quantify using EM algorithm (only in batch mode)" << endl
+       << "    --bus                     Output a BUS file" << endl
        << "    --single                  Quantify single-end reads" << endl
        << "-l, --fragment-length=DOUBLE  Estimated average fragment length" << endl
        << "-s, --sd=DOUBLE               Estimated standard deviation of fragment length" << endl
        << "                              (default: -l, -s values are estimated from paired" << endl
-       << "                               end data, but are required when using --single)" << endl
+       << "                               end data, but are required when using --single" << endl
+       << "                               unless outputting a BUS file via --bus)" << endl
+       << "    --fr-stranded             Strand specific reads, first read forward" << endl
+       << "    --rf-stranded             Strand specific reads, first read reverse" << endl
+       << "-n, --num                     Output number of read in BUS file flag column (only with --bus)" << endl
        << "-t, --threads=INT             Number of threads to use (default: 1)" << endl;
 //       << "    --pseudobam               Output pseudoalignments in SAM format to stdout" << endl;
 
@@ -1848,6 +2539,38 @@ void usageEMOnly() {
        << "    --plaintext               Output plaintext instead of HDF5" << endl << endl;
 }
 
+void usageTCCQuant(bool valid_input = true) {
+  if (valid_input) {
+    cout << "kallisto " << KALLISTO_VERSION << endl
+         << "Quantifies abundance from pre-computed transcript-compatibility counts" << endl << endl;
+  }
+  
+  cout << "Usage: kallisto quant-tcc [arguments] transcript-compatibility-counts-file" << endl << endl
+       << "Required arguments:" << endl
+       << "-o, --output-dir=STRING       Directory to write output to" << endl << endl
+       << "Optional arguments:" << endl
+       << "-i, --index=STRING            Filename for the kallisto index to be used" << endl
+       << "                              (required if file with names of transcripts not supplied)" << endl
+       << "-T, --txnames=STRING          File with names of transcripts" << endl
+       << "                              (required if index file not supplied)" << endl
+       << "-e, --ec-file=FILE            File containing equivalence classes" << endl
+       << "                              (default: equivalence classes are taken from the index)" << endl
+       << "-f, --fragment-file=FILE      File containing fragment length distribution" << endl
+       << "                              (default: effective length normalization is not performed)" << endl
+       << "-l, --fragment-length=DOUBLE  Estimated average fragment length" << endl
+       << "-s, --sd=DOUBLE               Estimated standard deviation of fragment length" << endl
+       << "                              (note: -l, -s values only should be supplied when" << endl
+       << "                               effective length normalization needs to be performed" << endl
+       << "                               but --fragment-file is not specified)" << endl
+       << "-t, --threads=INT             Number of threads to use (default: 1)" << endl
+       << "-g, --genemap                 File for mapping transcripts to genes" << endl
+       << "                              (required for obtaining gene-level abundances)" << endl
+       << "-G, --gtf=FILE                GTF file for transcriptome information" << endl
+       << "                              (can be used instead of --genemap for obtaining gene-level abundances)" << endl
+       << "-b, --bootstrap-samples=INT   Number of bootstrap samples (default: 0)" << endl
+       << "    --seed=INT                Seed for the bootstrap sampling (default: 42)" << endl;
+}
+
 std::string argv_to_string(int argc, char *argv[]) {
   std::string res;
   for (int i = 0; i < argc; ++i) {
@@ -1929,7 +2652,115 @@ int main(int argc, char *argv[]) {
       if (!CheckOptionsBus(opt)) {
         usageBus();
         exit(1);
-      } else {
+      } else if (opt.batch_mode) { // kallisto pseudo
+        // pseudoalign the reads
+        KmerIndex index(opt);
+        index.load(opt);
+        
+        MinCollector collection(index, opt);
+        int64_t num_processed = 0;
+        int64_t num_pseudoaligned = 0;
+        int64_t num_unique = 0;
+        
+        Transcriptome model; // empty model
+        if (!opt.gtfFile.empty()) {          
+          model.parseGTF(opt.gtfFile, index, opt, true);
+        }
+        MasterProcessor MP(index, opt, collection, model);
+        if (!opt.batch_mode) {
+          num_processed = ProcessReads(MP, opt);
+          collection.write((opt.output + "/pseudoalignments"));
+        } else {          
+          num_processed = ProcessBatchReads(MP,opt);
+        }
+        
+        std::string call = argv_to_string(argc, argv);
+        
+        if (!opt.batch_bus) {
+          for (int id = 0; id < MP.batchCounts.size(); id++) {
+            const auto &cc = MP.batchCounts[id];
+            for (const auto &p : cc) {
+              if (p.first < index.num_trans) {
+                num_unique += p.second;
+              }
+              num_pseudoaligned += p.second;
+            }
+          }
+        } else {
+          for (int i = 0; i < index.num_trans; i++) {
+            num_unique += collection.counts[i];          
+          }
+          for (int i = 0; i < collection.counts.size(); i++) {
+            num_pseudoaligned += collection.counts[i];
+          }
+        }
+        
+        std::ofstream transout_f((opt.output + "/transcripts.txt"));
+        for (const auto &t : index.target_names_) {
+          transout_f << t << "\n";
+        }
+        transout_f.close();
+        
+        plaintext_aux(
+          opt.output + "/run_info.json",
+          std::string(std::to_string(index.num_trans)),
+          std::string(std::to_string(0)), // no bootstraps in pseudo
+          std::string(std::to_string(num_processed)),
+          std::string(std::to_string(num_pseudoaligned)),
+          std::string(std::to_string(num_unique)),
+          KALLISTO_VERSION,
+          std::string(std::to_string(index.INDEX_VERSION)),
+          start_time,
+          call);
+
+        cerr << endl;
+        
+        std::string prefix = opt.output + "/matrix";
+        std::string ecfilename = prefix + ".ec";
+        std::string tccfilename = prefix + ".tcc.mtx";
+        std::string cellnamesfilename = prefix + ".cells";
+        std::string genelistname = prefix + ".genelist.txt";
+        std::string busbarcodelistname = prefix + ".barcodes";
+        std::string busoutputname = opt.output + "/output.bus";
+        
+        writeECList(ecfilename, index);
+        writeCellIds(cellnamesfilename, opt.batch_ids);
+        if (opt.batch_bus || opt.batch_bus_write) {
+          if (opt.batch_bus_write) {
+            writeBUSMatrix(busoutputname, MP.batchCounts, index.ecmap.size());
+          }
+          if (!MP.batchCounts.empty()) {
+            // Write out fake barcodes that identify each cell
+            std::vector<std::string> fake_bcs;
+            fake_bcs.reserve(MP.batchCounts.size());
+            for (size_t j = 0; j < MP.batchCounts.size(); j++) {
+              fake_bcs.push_back(binaryToString(j, BUSFORMAT_FAKE_BARCODE_LEN));
+            }
+            writeCellIds(busbarcodelistname, fake_bcs);
+          }
+          // Write out index:
+          index.write((opt.output + "/index.saved"), false);
+          // Write out fragment length distributions if reads paired-end:
+          if (!opt.single_end) {
+            std::ofstream flensout_f((opt.output + "/flens.txt"));
+            for (size_t id = 0; id < opt.batch_ids.size(); id++) {
+              std::vector<int> fld = MP.batchFlens[id];
+              for ( size_t i = 0 ; i < fld.size(); ++i ) {
+                if (i != 0) {
+                  flensout_f << " ";
+                }
+                flensout_f << fld[i];
+              }
+              flensout_f << "\n";
+            }
+            flensout_f.close();
+          }
+        }
+        if (!opt.gtfFile.empty()) {
+          // write out gene info
+          writeGeneList(genelistname, model);
+        }
+      } else { // kallisto bus -x
         int num_trans, index_version;
         int64_t num_processed = 0;
         int64_t num_pseudoaligned = 0;
@@ -2008,7 +2839,27 @@ int main(int argc, char *argv[]) {
           transout_f << t << "\n";
         }
         transout_f.close();
-
+        
+        std::vector<int> fld;
+        if (opt.busOptions.paired) {
+          fld = collection.flens; // copy
+          collection.compute_mean_frag_lens_trunc();
+          // Write out index:
+          index.write((opt.output + "/index.saved"), false);
+          // Write out fragment length distribution:
+          std::ofstream flensout_f((opt.output + "/flens.txt"));
+          for ( size_t i = 0 ; i < fld.size(); ++i ) {
+            if (i != 0) {
+              flensout_f << " ";
+            }
+            flensout_f << fld[i];
+          }
+          flensout_f << "\n";
+          flensout_f.close();
+        } else if (opt.busOptions.umi[0].fileno == -1) {
+          // Write out index:
+          index.write((opt.output + "/index.saved"), false);
+        }
 
         // gather stats
         num_unique = 0;
@@ -2395,6 +3246,302 @@ int main(int argc, char *argv[]) {
           exit(1);
         }
       }
+    } else if (cmd == "quant-tcc") {
+      if (argc==2) {
+        usageTCCQuant();
+        return 0;
+      }
+      ParseOptionsTCCQuant(argc-1,argv+1,opt);
+      if (!CheckOptionsTCCQuant(opt)) {
+        usageTCCQuant();
+        exit(1);
+      } else {
+        KmerIndex index(opt);
+        index.load(opt, false); // skip the k-mer map
+        size_t num_ecs = index.ecmap.size();
+        MinCollector collection(index, opt);
+        std::vector<std::vector<std::pair<int32_t, int32_t>>> batchCounts; // Stores TCCs
+        std::ifstream in((opt.tccFile));
+        bool firstline = true;
+        bool isMatrixFile = false;
+        size_t nrow = 0, ncol = 0, nlines = 0;
+        int i = 0, prevRow = 0, prevCol = 0;
+        if (in.is_open()) { // Read in TCC file
+          std::string line;
+          while (getline(in, line)) {
+            if (firstline) { // First line decides whether the file is in matrix format
+              firstline = false;
+              if (line.rfind("%%MatrixMarket", 0) == 0) {
+                std::cerr << "[tcc] Parsing transcript-compatibility counts (TCC) file as a matrix file" << std::endl;
+                isMatrixFile = true;
+                while (getline(in, line) && line.rfind("%", 0) == 0) { } // Skip header comments
+                std::stringstream ss(line);
+                ss >> nrow >> ncol >> nlines;
+                std::cerr << "[tcc] Matrix dimensions: " << pretty_num(nrow) << " x " << pretty_num(ncol) << std::endl;
+                batchCounts.assign(nrow, {});
+                if (opt.bootstrap > 0) {
+                  cerr << "[tcc] Warning: Bootstrap can only be used for non-matrix files; will not be performed" << endl;
+                }
+                continue;
+              } else {
+                std::cerr << "[tcc] Transcript-compatibility counts (TCC) file is not in matrix format; "
+                          << "it will not be parsed as a matrix file" << std::endl;
+                isMatrixFile = false;
+                batchCounts.assign(1, {});
+              }
+            }
+            std::stringstream ss(line);
+            int row, col, val;
+            if (isMatrixFile) {
+              if (i >= nlines) {
+                std::cerr << "[tcc] Warning: TCC matrix file contains additional lines which will not be read; "
+                          << "only " << pretty_num(nlines) << " entries, as specified on the first line, will be read." << std::endl;
+                break;
+              }
+              ss >> row >> col >> val;
+              if (row > nrow || col > ncol) {
+                std::cerr << "Error: TCC matrix file is malformed; " 
+                          << "row numbers or column numbers exceed the dimensions of the matrix." << std::endl;
+                exit(1);
+              }
+            } else {
+              ss >> col >> val;
+              col += 1; // Because we'll consider non-matrix TCC files to be zero-indexed
+              row = 1;
+              nrow = 1;
+              if (ncol < col) {
+                ncol = col;
+              }
+            }
+            if (row <= 0 || col <= 0) {
+              std::cerr << "Error: Invalid indices in TCC file." << std::endl;
+              exit(1);
+            }
+            if (row < prevRow || (row == prevRow && col <= prevCol)) {
+              std::cerr << "Error: TCC file is not sorted." << std::endl;
+              exit(1);
+            }
+            prevRow = row;
+            prevCol = col;
+            auto &bc = batchCounts[row-1];
+            bc.push_back({col-1, val});
+            i++;
+          }
+        } else {
+          std::cerr << "Error: could not open file " << opt.tccFile << std::endl;
+          exit(1);
+        }
+        if (isMatrixFile && i < nlines) {
+          std::cerr << "Error: Found only " << pretty_num(i) << " entries in TCC matrix file, "
+                    << "expected " << pretty_num(nlines) << std::endl;
+          exit(1);
+        }
+        if ((isMatrixFile && ncol != num_ecs) || (!isMatrixFile && ncol > num_ecs)) {
+          std::cerr << "Error: number of equivalence classes does not match index. Found "
+                    << pretty_num(ncol) << ", expected " << pretty_num(num_ecs) << std::endl;
+          exit(1);
+        }
+        
+        std::vector<std::vector<std::pair<int32_t, double>>> Abundance_mat, Abundance_mat_gene, TPM_mat, TPM_mat_gene;
+        Abundance_mat.resize(nrow, {});
+        Abundance_mat_gene.resize(nrow, {});
+        TPM_mat.resize(nrow, {});
+        TPM_mat_gene.resize(nrow, {});
+        std::vector<std::pair<double, double>> FLD_mat;
+        FLD_mat.resize(nrow, {});
+        std::vector<std::vector<int>> FLDs;
+        Transcriptome model; // empty model
+
+        std::ofstream transout_f((opt.output + "/transcripts.txt"));
+        for (const auto &t : index.target_names_) {
+          transout_f << t << "\n";
+        }
+        transout_f.close();
+        std::string prefix = opt.output + "/matrix";
+        std::string abtsvfilename = opt.output + "/abundance.tsv";
+        std::string gene_abtsvfilename = opt.output + "/abundance.gene.tsv";
+        std::string genelistname = opt.output + "/genes.txt";
+        std::string abfilename = prefix + ".abundance.mtx";
+        std::string abtpmfilename = prefix + ".abundance.tpm.mtx";
+        std::string gene_abfilename = prefix + ".abundance.gene.mtx";
+        std::string gene_abtpmfilename = prefix + ".abundance.gene.tpm.mtx";
+        std::string fldfilename = prefix + ".fld.tsv";
+
+        const bool calcEffLen = !opt.fldFile.empty() || opt.fld != 0.0;
+        if (calcEffLen && !opt.fldFile.empty()) { // Parse supplied fragment length distribution file
+          std::ifstream infld((opt.fldFile));
+          if (infld.is_open()) {
+            std::string line;
+            while (getline(infld, line)) {
+              if (line.empty() || line.rfind("#", 0) == 0) {
+                continue; // Ignore empty lines or lines that begin with a #
+              }
+              std::vector<int> tmp_vec;
+              std::stringstream ss(line);
+              while(ss.good()) {
+                std::string tmp_val;
+                getline(ss, tmp_val, ' ');
+                int tmp_val_num = std::atoi(tmp_val.c_str());
+                if (tmp_val_num < 0) {
+                  std::cerr << "Error: Fragment length distribution file contains invalid value: " 
+                            << tmp_val_num << std::endl;
+                  exit(1);
+                }
+                tmp_vec.push_back(tmp_val_num);
+              }
+              if (tmp_vec.size() != MAX_FRAG_LEN) {
+                std::cerr << "Error: Fragment length distribution file contains a line with " 
+                          << tmp_vec.size() << " values; expected: " << MAX_FRAG_LEN << std::endl;
+                exit(1);
+              }
+              FLDs.push_back(tmp_vec);
+            }
+            if (FLDs.size() != 1 && FLDs.size() != nrow) {
+              std::cerr << "Error: Fragment length distribution file contains " 
+                        << FLDs.size() << " valid lines; expected: " << nrow << std::endl;
+              exit(1);
+            }
+          } else {
+            std::cerr << "Error: could not open file " << opt.fldFile << std::endl;
+            exit(1);
+          }
+        }
+
+        if (!opt.genemap.empty()) { // Parse supplied gene mapping file
+          model.parseGeneMap(opt.genemap, index, opt);
+        } else if (!opt.gtfFile.empty()) {
+          model.parseGTF(opt.gtfFile, index, opt, true);
+        }
+        const bool gene_level_counting = !opt.genemap.empty() || !opt.gtfFile.empty();
+
+        std::cerr << "[quant] Running EM algorithm..."; std::cerr.flush();
+        auto EM_lambda = [&](int id) {
+          MinCollector collection(index, opt);
+          collection.counts.assign(index.ecmap.size(), 0);
+          const auto& bc = batchCounts[id];
+          for (const auto &p : bc) {
+            collection.counts[p.first] = p.second;
+          }
+          
+          std::vector<double> fl_means;
+          if (calcEffLen) {
+            if (opt.fld != 0.0) {
+              collection.init_mean_fl_trunc(opt.fld, opt.sd);
+            } else {
+              std::vector<int> fld;
+              if (FLDs.size() == 1) { // Only one distribution supplied; will use this for everything
+                fld = FLDs[0];
+              } else {
+                fld = FLDs[id];
+              }
+              collection.flens = fld;
+              collection.compute_mean_frag_lens_trunc(false);
+            }
+            fl_means = get_frag_len_means(index.target_lens_, collection.mean_fl_trunc);
+            double mean_fl = collection.get_mean_frag_len(true);
+            double sd_fl = collection.get_sd_frag_len();
+            FLD_mat[id] = {mean_fl, sd_fl};
+          } else { // Set mean fragment lengths to be target lengths so that effective lengths are all one
+            fl_means = std::vector<double>(index.target_lens_.begin(), index.target_lens_.end());
+          }
+
+          EMAlgorithm em(collection.counts, index, collection, fl_means, opt);
+          em.run(10000, 50, false, false);
+          
+          if (isMatrixFile) { // Update abundances matrix
+            auto &ab_m = Abundance_mat[id];
+            auto &tpm_m = TPM_mat[id];
+            std::vector<double> gc;
+            std::vector<double> gc_tpm;
+            if (gene_level_counting) {
+              gc.assign(model.genes.size(), 0.0);
+              gc_tpm.assign(model.genes.size(), 0.0);
+            }
+            auto tpm = counts_to_tpm(em.alpha_, em.eff_lens_);
+            for (int i = 0; i < em.alpha_.size(); i++) {
+              if (em.alpha_[i] > 0.0) {
+                ab_m.push_back({i,em.alpha_[i]});
+                tpm_m.push_back({i,tpm[i]});
+                if (gene_level_counting) {
+                  int g_id = model.transcripts[i].gene_id;
+                  if (g_id != -1) {
+                    gc[g_id] += em.alpha_[i];
+                    gc_tpm[g_id] += tpm[i];
+                  }
+                }
+              }
+            }
+            if (gene_level_counting) {
+              auto &ab_m_g = Abundance_mat_gene[id];
+              auto &tpm_m_g = TPM_mat_gene[id];
+              for (int i = 0; i < gc.size(); i++) {
+                if (gc[i] > 0.0) {
+                  ab_m_g.push_back({i, gc[i]});
+                  tpm_m_g.push_back({i, gc_tpm[i]});
+                }
+              }
+            }
+          } else { // Write plaintext abundances
+            plaintext_writer(abtsvfilename, em.target_names_,
+                             em.alpha_, em.eff_lens_, index.target_lens_);
+            if (gene_level_counting) {
+              plaintext_writer_gene(gene_abtsvfilename, em.target_names_,
+                                     em.alpha_, em.eff_lens_, model);
+            }
+          }
+          
+          if (opt.bootstrap > 0 && !isMatrixFile) {
+            auto B = opt.bootstrap;
+            std::mt19937_64 rand;
+            rand.seed( opt.seed );
+            std::vector<size_t> seeds;
+            for (auto s = 0; s < B; ++s) {
+              seeds.push_back( rand() );
+            }
+            cerr << endl;
+            for (auto b = 0; b < B; ++b) {
+              Bootstrap bs(collection.counts, index, collection, em.eff_lens_, seeds[b], fl_means, opt);
+              cerr << "[bstrp] running EM for the bootstrap: " << b + 1 << "\r";
+              auto res = bs.run_em();
+              plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
+                               em.target_names_, res.alpha_, em.eff_lens_, index.target_lens_);
+            }
+            cerr << endl;
+          }
+        }; // end of EM_lambda
+        
+        std::vector<std::thread> workers;
+        int num_ids = nrow;
+        int id = 0;
+        while (id < num_ids) {
+          workers.clear();
+          int nt = std::min(opt.threads, (num_ids - id));
+          int first_id = id;
+          for (int i = 0; i < nt; i++,id++) {
+            workers.emplace_back(std::thread(EM_lambda, id));
+          }
+          for (int i = 0; i < nt; i++) {
+            workers[i].join();
+          }
+        }
+        std::cerr << " done" << std::endl;
+        
+        cerr << endl;
+        
+
+        if (isMatrixFile) {
+          writeSparseBatchMatrix(abfilename, Abundance_mat, index.num_trans);
+          writeSparseBatchMatrix(abtpmfilename, TPM_mat, index.num_trans);
+          if (gene_level_counting) {
+            writeSparseBatchMatrix(gene_abfilename, Abundance_mat_gene, model.genes.size());
+            writeSparseBatchMatrix(gene_abtpmfilename, TPM_mat_gene, model.genes.size());
+            writeGeneList(genelistname, model, true);
+          }
+        }
+        if (calcEffLen) {
+          writeFLD(fldfilename, FLD_mat);
+        }
+      }
     } else if (cmd == "pseudo") {
       if (argc==2) {
         usagePseudo();
@@ -2429,14 +3576,22 @@ int main(int argc, char *argv[]) {
 
         std::string call = argv_to_string(argc, argv);
 
-
-        for (int id = 0; id < MP.batchCounts.size(); id++) {
-          const auto &cc = MP.batchCounts[id];
-          for (const auto &p : cc) {
-            if (p.first < index.num_trans) {
-              num_unique += p.second;
+        if (!opt.batch_bus) {
+          for (int id = 0; id < MP.batchCounts.size(); id++) {
+            const auto &cc = MP.batchCounts[id];
+            for (const auto &p : cc) {
+              if (p.first < index.num_trans) {
+                num_unique += p.second;
+              }
+              num_pseudoaligned += p.second;
             }
-            num_pseudoaligned += p.second;
+          }
+        } else {
+          for (int i = 0; i < index.num_trans; i++) {
+            num_unique += collection.counts[i];          
+          }
+          for (int i = 0; i < collection.counts.size(); i++) {
+            num_pseudoaligned += collection.counts[i];
           }
         }
         
@@ -2546,10 +3701,44 @@ int main(int argc, char *argv[]) {
         std::string fldfilename = prefix + ".fld.tsv";
         std::string genelistname = prefix + ".genelist.txt";
         std::string genecountname = prefix + ".genes.mtx";
+        std::string busbarcodelistname = prefix + ".barcodes";
+        std::string busoutputname = opt.output + "/output.bus";
 
         writeECList(ecfilename, index);
         writeCellIds(cellnamesfilename, opt.batch_ids);
-        writeSparseBatchMatrix(tccfilename, MP.batchCounts, index.ecmap.size());
+        if (opt.batch_bus || opt.batch_bus_write) {
+          if (opt.batch_bus_write) {
+            writeBUSMatrix(busoutputname, MP.batchCounts, index.ecmap.size());
+          }
+          if (!MP.batchCounts.empty()) {
+            // Write out fake barcodes that identify each cell
+            std::vector<std::string> fake_bcs;
+            fake_bcs.reserve(MP.batchCounts.size());
+            for (size_t j = 0; j < MP.batchCounts.size(); j++) {
+              fake_bcs.push_back(binaryToString(j, BUSFORMAT_FAKE_BARCODE_LEN));
+            }
+            writeCellIds(busbarcodelistname, fake_bcs);
+          }
+          // Write out index:
+          index.write((opt.output + "/index.saved"), false);
+          // Write out fragment length distributions if reads paired-end:
+          if (!opt.single_end) {
+            std::ofstream flensout_f((opt.output + "/flens.txt"));
+            for (size_t id = 0; id < opt.batch_ids.size(); id++) {
+              std::vector<int> fld = MP.batchFlens[id];
+              for ( size_t i = 0 ; i < fld.size(); ++i ) {
+                if (i != 0) {
+                  flensout_f << " ";
+                }
+                flensout_f << fld[i];
+              }
+              flensout_f << "\n";
+            }
+            flensout_f.close();
+          }
+        } else {
+          writeSparseBatchMatrix(tccfilename, MP.batchCounts, index.ecmap.size());
+        }
         if (opt.pseudo_quant) {
           writeSparseBatchMatrix(abfilename, Abundance_mat, index.num_trans);
           writeFLD(fldfilename, FLD_mat);



View it on GitLab: https://salsa.debian.org/med-team/kallisto/-/compare/a1ccc5194630a474877de4bad34ce53508312a1c...1e5743ab0d054a3a530ba2322e48f4c92336b43c

-- 
View it on GitLab: https://salsa.debian.org/med-team/kallisto/-/compare/a1ccc5194630a474877de4bad34ce53508312a1c...1e5743ab0d054a3a530ba2322e48f4c92336b43c
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220119/cf885e4b/attachment-0001.htm>