[med-svn] [hilive] 01/02: Imported Upstream version 0.0+20150926
Andreas Tille
tille at debian.org
Fri Jul 29 12:19:24 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository hilive.
commit da4bbd386340613584cf2fb25dc783ba843170d2
Author: Andreas Tille <tille at debian.org>
Date: Fri Jul 29 14:17:36 2016 +0200
Imported Upstream version 0.0+20150926
---
CMakeLists.txt | 57 ++++
LICENSE | 12 +
README | 86 +++++
lib/alnblock.cpp | 359 +++++++++++++++++++++
lib/alnblock.h | 81 +++++
lib/alnread.cpp | 823 +++++++++++++++++++++++++++++++++++++++++++++++
lib/alnread.h | 135 ++++++++
lib/alnstream.cpp | 727 +++++++++++++++++++++++++++++++++++++++++
lib/alnstream.h | 177 ++++++++++
lib/config.h.in | 7 +
lib/definitions.h | 157 +++++++++
lib/headers.h | 24 ++
lib/illumina_parsers.cpp | 70 ++++
lib/illumina_parsers.h | 59 ++++
lib/kindex.cpp | 595 ++++++++++++++++++++++++++++++++++
lib/kindex.h | 103 ++++++
lib/parallel.cpp | 287 +++++++++++++++++
lib/parallel.h | 130 ++++++++
lib/tools.cpp | 253 +++++++++++++++
lib/tools.h | 46 +++
tools/.ann.location | 1 +
tools/build_index.cpp | 125 +++++++
tools/hilive.cpp | 477 +++++++++++++++++++++++++++
23 files changed, 4791 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..0b2562d
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,57 @@
+# Header
+cmake_minimum_required (VERSION 2.8)
+project (HiLive)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -g -pthread -W -Wall -std=gnu++11 -O2")
+
+# Set the k-mer length
+set (HiLive_K 15 CACHE INT "Set the k-mer length for index and mapper")
+
+# Set the version number
+set (HiLive_VERSION_MAJOR 0)
+set (HiLive_VERSION_MINOR 1)
+
+configure_file (
+ "${PROJECT_SOURCE_DIR}/lib/config.h.in"
+ "${PROJECT_BINARY_DIR}/config.h" )
+
+# add the binary tree to the search path for include files
+include_directories("${PROJECT_BINARY_DIR}")
+
+
+
+include_directories(lib)
+
+set(LIBS tools
+ alnblock
+ alnread
+ alnstream
+ illumina_parsers
+ kindex
+ parallel)
+
+set(LIB_FILES "")
+foreach (lib ${LIBS})
+ list(APPEND LIB_FILES "lib/${lib}.cpp")
+endforeach()
+add_library(hilivelib ${LIB_FILES})
+
+set(Boost_USE_STATIC_LIBS ON)
+set(Boost_USE_MULTITHREADED ON)
+set(Boost_USE_STATIC_RUNTIME ON)
+FIND_PACKAGE( Boost COMPONENTS system filesystem program_options REQUIRED )
+INCLUDE_DIRECTORIES( ${Boost_INCLUDE_DIR} )
+
+find_package( ZLIB REQUIRED )
+
+# manually set the path for lz4 library
+set (LZ4_PATH /usr/local/lib CACHE PATH "Manually set the path to the lz4 library")
+LINK_DIRECTORIES(${LZ4_PATH})
+
+
+# create the executables
+add_executable(hilive tools/hilive.cpp )
+target_link_libraries(hilive hilivelib ${ZLIB_LIBRARIES} ${Boost_LIBRARIES} lz4)
+
+add_executable(hilive-build tools/build_index.cpp )
+target_link_libraries(hilive-build hilivelib ${ZLIB_LIBRARIES} ${Boost_LIBRARIES} lz4)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..734932c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,12 @@
+Copyright (c) 2015, Martin S. Lindner, marzin at mail-lindner.de
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROF [...]
\ No newline at end of file
diff --git a/README b/README
new file mode 100644
index 0000000..f7c9a15
--- /dev/null
+++ b/README
@@ -0,0 +1,86 @@
+HiLive - Live Mapping of Illumina reads
+
+1. Description
+--------------
+HiLive is a read mapping tool that maps Illumina HiSeq (or comparable)
+reads right in the moment when they are produced. This means, read mapping
+is finished as soon as the sequencer is finished.
+
+
+2. Website
+----------
+
+The HiLive project website is https://sourceforge.net/projects/hilive/
+
+There you can find the latest version of HiLive, source code, documentation,
+and examples.
+
+
+3. Installation
+---------------
+
+If the binary distribution from the project website does not work for you,
+you can still compile HiLive from source.
+
+Make sure that the following dependences are installed:
+ - cmake (>= 2.8)
+ - boost (system, filesystem, program_options)
+ - zlib
+ - lz4
+
+Check out the source code from the project website. Compile HiLive with:
+ cd [hilive-code]
+ mkdir build && cd build
+ cmake ..
+ make
+
+To compile HiLive with a different k-mer size than k=15 make the following
+adjustment (here: k=10):
+ cmake -DHiLive_K 10 ..
+ make
+
+
+4. Usage
+--------
+
+HiLive has two components:
+ - hilive-build: build the k-mer index of the reference genome
+ - hilive: the read mapper itself
+
+Using hilive-build:
+
+Building a k-mer index from FASTA file input.fa:
+ hilive-build input.fa
+The index is written to input.fa.kix
+
+Building an index from a large reference genome. Here is makes sense to use trimming,
+i.e. removing k-mers from the index that occurr more than 1000 times (for example) in
+the index. The index is written into the file trimmed.kix
+ hilive-build -t 1000 -o trimmed.kix input.fa
+
+Using hilive:
+
+To map reads in a 100bp run using default settings:
+ hilive /path/to/BaseCalls /path/to/index.kix 100
+
+We recommend to adjust the numbers of threads used by HiLive with -n. If possible,
+make use of all threads on the machine!
+
+Please consult the project website for more details on the parameters!
+
+
+5. License
+----------
+
+See the file LICENSE for licensing information.
+
+
+6. Contact
+----------
+
+Please consult the HiLive project website for questions!
+
+If this does not help, please feel free to consult:
+
+Martin S. Lindner <martin (at) mail-lindner.de> (technical contact)
+Bernhard Y. Renard <renardb (at) rki.de> (project head)
diff --git a/lib/alnblock.cpp b/lib/alnblock.cpp
new file mode 100644
index 0000000..be4e979
--- /dev/null
+++ b/lib/alnblock.cpp
@@ -0,0 +1,359 @@
+#include "alnblock.h"
+
+
+uint16_t DatasetAlignment::get_cycle() {
+ return cycle;
+}
+
+uint16_t DatasetAlignment::get_lane() {
+ return lane;
+}
+
+uint16_t DatasetAlignment::get_tile() {
+ return tile;
+}
+
+std::string DatasetAlignment::get_root() {
+ return root;
+}
+
+CountType DatasetAlignment::get_read_length() {
+ return rlen;
+}
+
+std::string DatasetAlignment::get_bcl_file(uint16_t cl /* = 0 */) {
+ std::ostringstream path_stream;
+ uint16_t thiscycle = cl;
+ if ( cl == 0 ) {
+ // only allow cycle numbers > 0
+ assert(cycle > 0);
+ thiscycle = cycle;
+ }
+ path_stream << root << "/L00" << int(lane) << "/C" << int(thiscycle) << ".1/s_"<< int(lane) <<"_" << int(tile) << ".bcl";
+ return path_stream.str();
+}
+
+
+std::string DatasetAlignment::get_alignment_file() {
+ std::ostringstream path_stream;
+ path_stream << root << "/L00" << int(lane) << "/s_"<< int(lane) << "_" << int(tile) << ".align";
+ return path_stream.str();
+}
+
+
+std::string DatasetAlignment::get_filter_file() {
+ std::ostringstream path_stream;
+ path_stream << root << "/L00" << int(lane) << "/s_"<< int(lane) << "_" << int(tile) << ".filter";
+ return path_stream.str();
+}
+
+
+std::vector<char> DatasetAlignment::serialize() {
+ // calculate total size first
+ unsigned long int total_size = 0;
+
+ total_size += sizeof(uint16_t); // lane
+ total_size += sizeof(uint16_t); // tile
+ total_size += sizeof(CountType); // cycle
+
+ // root directory name
+ uint16_t root_size = root.size();
+ total_size += sizeof(uint16_t);
+ total_size += root_size;
+
+ // number of reads
+ unsigned long int num_reads = reads.size();
+ total_size += sizeof(uint64_t);
+
+ // read length
+ total_size += sizeof(CountType);
+
+ // sum up the sizes of the single alignments
+ for (auto it = reads.begin(); it != reads.end(); ++it) {
+ total_size += sizeof(uint32_t);
+ total_size += (*it).serialize_size();
+ }
+
+ // create the vector to store the data
+ std::vector<char> data (total_size);
+ char* d = data.data();
+
+ // write the lane
+ memcpy(d,&lane,sizeof(uint16_t));
+ d += sizeof(uint16_t);
+
+ // write the tile
+ memcpy(d,&tile,sizeof(uint16_t));
+ d += sizeof(uint16_t);
+
+ // write the cycle
+ memcpy(d,&cycle,sizeof(CountType));
+ d += sizeof(CountType);
+
+ // root directory name
+ memcpy(d,&root_size,sizeof(uint16_t));
+ d += sizeof(uint16_t);
+
+ memcpy(d,root.c_str(),root_size);
+ d += root_size;
+
+ // write the read length
+ memcpy(d,&rlen,sizeof(CountType));
+ d += sizeof(CountType);
+
+ // write the number of reads
+ memcpy(d,&num_reads,sizeof(uint64_t));
+ d += sizeof(uint64_t);
+
+ // write the reads
+ int i = 0;
+ for (auto it = reads.begin(); it != reads.end(); ++it,++i) {
+
+ std::vector<char> sread = it->serialize();
+ uint32_t size = sread.size();
+
+ memcpy(d,&size,sizeof(uint32_t));
+ d += sizeof(uint32_t);
+ memcpy(d,sread.data(),size);
+ d += size;
+
+ }
+
+ return data;
+}
+
+
+uint64_t DatasetAlignment::serialize_file(std::string f /* = "" */) {
+ std::string fname = get_alignment_file();
+ if ( f != "" ) {
+ fname = f;
+ }
+
+ // serialize data
+ std::vector<char> sdata = serialize();
+
+ // write data
+ return write_binary_file(fname,sdata);
+
+}
+
+
+
+uint64_t DatasetAlignment::deserialize(char* d) {
+ // the total number of bytes read
+ uint64_t bytes = 0;
+
+ // read the lane
+ memcpy(&lane,d+bytes,sizeof(uint16_t));
+ bytes += sizeof(uint16_t);
+
+ // read the tile
+ memcpy(&tile,d+bytes,sizeof(uint16_t));
+ bytes += sizeof(uint16_t);
+
+ // read the cycle
+ memcpy(&cycle,d+bytes,sizeof(unsigned short));
+ bytes += sizeof(unsigned short);
+
+ // read the root directory name
+ uint16_t root_size;
+ memcpy(&root_size,d+bytes,sizeof(uint16_t));
+ bytes += sizeof(uint16_t);
+
+ char tmp[root_size+1];
+ memcpy(tmp,d+bytes,root_size);
+ tmp[root_size] = 0; // make the string null-terminated
+ root = tmp;
+ bytes += root_size;
+
+ // read the read length
+ memcpy(&rlen,d+bytes,sizeof(CountType));
+ bytes += sizeof(CountType);
+
+ // read the number of reads
+ unsigned long int num_reads;
+ memcpy(&num_reads,d+bytes,sizeof(uint64_t));
+ bytes += sizeof(uint64_t);
+
+ // reading the reads
+ reads.clear();
+ reads.reserve(num_reads);
+ for (unsigned int i = 0; i < num_reads; ++i) {
+ // get the size of the read
+ uint32_t size;
+ memcpy(&size,d+bytes,sizeof(uint32_t));
+ bytes += sizeof(uint32_t);
+
+ // read as a chunk and deserialize
+ reads.emplace_back(ReadAlignment(rlen));
+ //if (flags[i]) {
+ std::vector<char> sread (size);
+ memcpy(sread.data(),d+bytes,size);
+ reads.back().deserialize(sread.data());
+ //}
+
+ bytes += size;
+ }
+
+ return bytes;
+}
+
+
+unsigned long int DatasetAlignment::deserialize_file(std::string f /* = "" */) {
+ std::string fname = get_alignment_file();
+ if ( f != "" ) {
+ fname = f;
+ }
+
+ // read raw data from file
+ std::vector<char> sdata = read_binary_file(fname);
+
+ // deserialize data
+ deserialize(sdata.data());
+
+ return sdata.size();
+}
+
+
+void DatasetAlignment::initialize(std::string rt, int ln, int tl, int rl) {
+ // set the basic stuff
+ root = rt;
+ lane = ln;
+ tile = tl;
+ cycle = 0;
+ rlen = rl;
+
+ // get the number of reads in this tile by looking in the first bcl file
+ std::string fname = get_bcl_file(1);
+
+ // extract the number of reads
+ uint32_t num_reads = num_reads_from_bcl(fname);
+
+ // create vector for read alignments
+ reads.clear();
+ reads.reserve(num_reads);
+ for (uint32_t i = 0; i < num_reads; i++){
+ reads.emplace_back(ReadAlignment(rlen));
+ }
+}
+
+
+
+void DatasetAlignment::add_nucleotide(KixRun* index, AlignmentSettings* settings) {
+
+ // TODO: check the prerequisites: bcl file must be available
+
+ // ----------------------------------------
+ // read the BCL file
+ // ----------------------------------------
+ std::string fname = get_bcl_file(cycle+1);
+
+ std::vector<char> bcl_data = read_binary_file(fname);
+
+ // extract the number of reads from the BCL file
+ uint64_t num_reads;
+ memcpy(&num_reads,bcl_data.data(),4);
+
+ if (num_reads != reads.size()){
+ std::cerr << "Input Error: Number of base calls (" << num_reads << ") does not match the number of reads in this dataset (" << reads.size() << ")." << std::endl;
+ return;
+ }
+
+ // pointer to the beginning of the base call data block
+ char* base_calls = bcl_data.data() + 4;
+
+ // ----------------------------------------
+ // load the filter flags
+ // ----------------------------------------
+ std::string filter_fname = get_filter_file();
+
+ std::vector<char> filterdata = read_binary_file(filter_fname);
+
+ // extract the number of reads from the filter file
+ unsigned int num_reads_filter;
+ memcpy(&num_reads_filter,filterdata.data()+8,4);
+
+ if (num_reads != num_reads_filter){
+ std::cerr << "Input Error: Number of reads in filter file (" << num_reads_filter << ") does not match the number of reads in the BCL file (" << num_reads << ")." << std::endl;
+ return;
+ }
+
+ // pointer to the beginning of the base call data block
+ char* filter_flags = filterdata.data() + 12;
+
+
+ // ----------------------------------------
+ // update the reads
+ // ----------------------------------------
+
+ // timing stuff
+ std::chrono::high_resolution_clock::duration d_vec = std::chrono::high_resolution_clock::duration::zero();
+ std::chrono::high_resolution_clock::duration d_seed = std::chrono::high_resolution_clock::duration::zero();
+ std::chrono::high_resolution_clock::duration d_add = std::chrono::high_resolution_clock::duration::zero();
+ std::chrono::high_resolution_clock::duration d_rem = std::chrono::high_resolution_clock::duration::zero();
+ std::chrono::high_resolution_clock::duration d_sort = std::chrono::high_resolution_clock::duration::zero();
+
+
+
+ auto rit = reads.begin();
+ for (uint64_t i = 0; i < num_reads; ++i, /*++fit,*/ ++rit) {
+
+ // extend the alignment by one nucleotide, if flags are ok
+ if (*(filter_flags+i) > 0) {
+ std::vector<std::chrono::high_resolution_clock::duration> t = (*rit).extend_alignment(*(base_calls+i), index, settings);
+ d_vec += t[0];
+ d_seed += t[1];
+ d_add += t[2];
+ d_rem += t[3];
+ d_sort += t[4];
+ }else {
+ (*rit).seeds.clear();
+ }
+ }
+
+ cycle++;
+
+ std::cout << "Time gather seeds: " << d_vec.count()/1000 << std::endl;
+ std::cout << "Time extend seeds: " << d_seed.count()/1000 << std::endl;
+ std::cout << "Time add new seeds: " << d_add.count()/1000 << std::endl;
+ std::cout << "Time filter seeds: " << d_rem.count()/1000 << std::endl;
+ std::cout << "Time sorting seeds: " << d_sort.count()/1000 << std::endl;
+
+
+ return;
+}
+
+
+
+void DatasetAlignment::report_alignments_spam(std::string fname, KixRun* idx) {
+
+ // open output file for writing
+ FILE* ofile;
+ ofile = fopen(fname.c_str(), "w");
+ if (!ofile) {
+ std::cerr << "Error opening output file " << fname << std::endl;
+ return;
+ }
+
+ // write out all seeds of each read
+ uint64_t rd = 0;
+ for (auto rit = reads.begin(); rit != reads.end(); ++rit, ++rd) {
+ // compose a read name
+ std::string rname = std::string("read_") + std::to_string(rd);
+
+ for (auto sit = rit->seeds.begin(); sit != rit->seeds.end(); ++sit) {
+ PositionType pos = (*sit)->start_pos;
+ if (pos < 0) {
+ pos = -pos - rlen + K;
+ }
+ std::stringstream out;
+ out << rname << "\t" << idx->get_name((*sit)->gid) << "\t" << pos << "\t" << CIGAR(**sit) << std::endl;
+ fwrite(out.str().c_str(), 1, out.str().size(), ofile);
+ }
+
+ }
+
+ fclose(ofile);
+}
+
+
diff --git a/lib/alnblock.h b/lib/alnblock.h
new file mode 100644
index 0000000..9e06963
--- /dev/null
+++ b/lib/alnblock.h
@@ -0,0 +1,81 @@
+#ifndef ALNBLOCK_H
+#define ALNBLOCK_H
+
+#include "headers.h"
+#include "definitions.h"
+#include "kindex.h"
+#include "tools.h"
+#include "alnread.h"
+
+
+//-------------------------------------------------------------------//
+//------ The Dataset-Alignment class ------------------------------//
+//-------------------------------------------------------------------//
+
+class DatasetAlignment {
+
+ // dataset information
+ uint16_t cycle;
+ uint16_t lane;
+ uint16_t tile;
+ std::string root; // the BaseCalls directory
+ CountType rlen;
+
+ public:
+
+ // getter functions for the dataset information
+ uint16_t get_cycle();
+ uint16_t get_lane();
+ uint16_t get_tile();
+ std::string get_root();
+ CountType get_read_length();
+
+ void set_cycle(uint16_t c) {cycle=c;}
+
+ // get the path to the bcl file of a given cycle. cl = 0 denotes the current cycle.
+ std::string get_bcl_file(uint16_t cl = 0);
+
+ // get the path to the alignment file. The alignment file is located in
+ // <root>/L00<lane>/s_<lane>_<tile>.align
+ std::string get_alignment_file();
+
+ // get the path to the filter file. The illumina filter information is located in
+ // <root>/L00<lane>/s_<lane>_<tile>.filter
+ std::string get_filter_file();
+
+ // illumina validity flags for all reads
+ /* std::vector<bool> flags; */
+
+ // all read alignments
+ std::vector<ReadAlignment> reads;
+
+ // serialize the object into a char vector
+ std::vector<char> serialize();
+
+ // serialize the object into a binary file
+ uint64_t serialize_file(std::string f = "");
+
+ // deserialize (read) data from a char vector
+ uint64_t deserialize(char* d);
+
+ // deserialize data from a binary file
+ uint64_t deserialize_file(std::string f = "");
+
+ // initialize the dataset alignment with
+ // 1. BaseCalls directory
+ // 2. Lane number
+ // 3. Tile number
+ // 4. Read length of Read 1
+ void initialize(std::string rt, int ln, int tl, int rl);
+
+ // extend alignment by one nucleotide
+ void add_nucleotide(KixRun* index, AlignmentSettings* settings);
+
+ // report alignments in SPAM format
+ void report_alignments_spam(std::string fname, KixRun* idx);
+
+}; // END class DatasetAlignment
+
+
+
+#endif /* ALNBLOCK_H */
diff --git a/lib/alnread.cpp b/lib/alnread.cpp
new file mode 100644
index 0000000..ed892a4
--- /dev/null
+++ b/lib/alnread.cpp
@@ -0,0 +1,823 @@
+#include <time.h>
+#include <chrono>
+
+#include "alnread.h"
+
+
+bool seed_compare (Seed i,Seed j) {
+ return (i.start_pos < j.start_pos);
+}
+
+bool seed_compare_pos (const USeed & i, const USeed & j) {
+ return (i->start_pos < j->start_pos);
+}
+
+bool seed_compare_num_matches (const USeed & i, const USeed & j) {
+ return (i->num_matches < j->num_matches);
+}
+
+uint16_t Seed::serialize_size() {
+ // calculate total size
+ uint16_t total_size = 0;
+
+ total_size += sizeof(GenomeIdType); // the target genome ID
+ total_size += sizeof(PositionType); // the start position
+ total_size += sizeof(CountType); // the number of matching positions
+
+ if (cigar_data.size() >= 256)
+ throw std::overflow_error("CIGAR information contains more than 255 elements!");
+ uint8_t cigar_len = cigar_data.size();
+ total_size += sizeof(uint8_t); // the size of the cigar information
+ total_size += cigar_len*(sizeof(CountType) + sizeof(DiffType)); // the cigar information itself
+
+ return total_size;
+}
+
+std::vector<char> Seed::serialize() {
+ // get the total size of the serialization
+ uint16_t total_size = serialize_size();
+ uint8_t cigar_len = (uint8_t) cigar_data.size();
+
+ // create the vector to store the data
+ std::vector<char> data (total_size);
+ char* d = data.data();
+
+ // write the target Genome ID
+ memcpy(d,&gid,sizeof(GenomeIdType));
+ d += sizeof(GenomeIdType);
+
+ // write the start position
+ memcpy(d,&start_pos,sizeof(PositionType));
+ d += sizeof(PositionType);
+
+ // write the number of matches
+ memcpy(d,&num_matches,sizeof(CountType));
+ d += sizeof(CountType);
+
+ // write the number of cigar elements
+ memcpy(d,&cigar_len,sizeof(uint8_t));
+ d += sizeof(uint8_t);
+
+ // write the seeds
+ for (auto it = cigar_data.begin(); it != cigar_data.end(); ++it) {
+ memcpy(d,&(it->length),sizeof(CountType));
+ d += sizeof(CountType);
+
+ memcpy(d,&(it->offset),sizeof(DiffType));
+ d += sizeof(DiffType);
+ }
+
+ return data;
+}
+
+
+
+uint16_t Seed::deserialize(char* d) {
+
+ // the total number of bytes read
+ uint16_t bytes = 0;
+
+ // read the target Genome ID
+ memcpy(&gid,d,sizeof(GenomeIdType));
+ bytes += sizeof(GenomeIdType);
+
+ // read the start position
+ memcpy(&start_pos,d+bytes,sizeof(PositionType));
+ bytes += sizeof(PositionType);
+
+ // read the number of matches
+ memcpy(&num_matches,d+bytes,sizeof(CountType));
+ bytes += sizeof(CountType);
+
+ // read the number of cigar elements
+ uint8_t cigar_len = 0;
+ memcpy(&cigar_len,d+bytes,sizeof(uint8_t));
+ bytes += sizeof(uint8_t);
+
+ // read the cigar elements
+ cigar_data.clear();
+ cigar_data.reserve(cigar_len);
+ for (uint8_t i = 0; i < cigar_len; ++i) {
+ CigarElement cig;
+ memcpy(&(cig.length),d+bytes,sizeof(CountType));
+ bytes += sizeof(CountType);
+
+ memcpy(&(cig.offset),d+bytes,sizeof(DiffType));
+ bytes += sizeof(DiffType);
+
+ cigar_data.push_back(cig);
+ }
+
+ return bytes;
+}
+
+
+
+
+
+
+uint64_t ReadAlignment::serialize_size() {
+ // calculate total size first
+ uint64_t total_size = 0;
+ total_size += 1; // the flag
+ total_size += sizeof(HashIntoType); // the last k-mer
+ total_size += sizeof(CountType); // the cycle number
+ total_size += sizeof(CountType); // the last_invalid cycle
+
+ // total number of seeds
+ total_size += sizeof(uint32_t);
+
+ // size of the single seeds
+ for (auto & s : seeds) {
+ total_size += sizeof(uint16_t) + s->serialize_size();
+ }
+
+ return total_size;
+}
+
+std::vector<char> ReadAlignment::serialize() {
+ // get the total size of the serialization
+ uint64_t total_size = serialize_size();
+ uint32_t num_seeds = (uint32_t) seeds.size();
+
+ // create the vector to store the data
+ std::vector<char> data (total_size);
+ char* d = data.data();
+
+ // write the flag
+ memcpy(d,&flags,1);
+ d++;
+
+ // write the last k-mer
+ memcpy(d,&last_kmer,sizeof(HashIntoType));
+ d += sizeof(HashIntoType);
+
+ // write the cycle
+ memcpy(d,&cycle,sizeof(CountType));
+ d += sizeof(CountType);
+
+ // write the last invalid cycle
+ memcpy(d,&last_invalid,sizeof(CountType));
+ d += sizeof(CountType);
+
+ // write the number of seeds
+ memcpy(d,&num_seeds,sizeof(uint32_t));
+ d += sizeof(uint32_t);
+
+ // write the seeds
+ for (auto it = seeds.begin(); it != seeds.end(); ++it) {
+ std::vector<char> seed_data = (*it)->serialize();
+ uint16_t seed_size = seed_data.size();
+
+ memcpy(d,&seed_size,sizeof(uint16_t));
+ d += sizeof(uint16_t);
+
+ memcpy(d,seed_data.data(),seed_size);
+ d += seed_size;
+ }
+
+ return data;
+}
+
+
+
+uint64_t ReadAlignment::deserialize(char* d) {
+ // the total number of bytes read
+ uint64_t bytes = 0;
+
+ // read the flag
+ memcpy(&flags,d,1);
+ bytes++;
+
+ // read the last k-mer
+ memcpy(&last_kmer,d+bytes,sizeof(HashIntoType));
+ bytes += sizeof(HashIntoType);
+
+ // read the cycle
+ memcpy(&cycle,d+bytes,sizeof(CountType));
+ bytes += sizeof(CountType);
+
+ // read the last invalid cycle
+ memcpy(&last_invalid,d+bytes,sizeof(CountType));
+ bytes += sizeof(CountType);
+
+ // read the number of seeds
+ uint32_t num_seeds = 0;
+ memcpy(&num_seeds,d+bytes,sizeof(uint32_t));
+ bytes += sizeof(uint32_t);
+
+ // read the seeds
+ seeds.clear();
+ seeds.reserve(num_seeds);
+ for (uint32_t i = 0; i < num_seeds; ++i) {
+ uint16_t seed_size = 0;
+ memcpy(&seed_size,d+bytes,sizeof(uint16_t));
+ bytes += sizeof(uint16_t);
+
+ std::vector<char> seed_data (seed_size,0);
+ memcpy(seed_data.data(),d+bytes,seed_size);
+ bytes += seed_size;
+
+ USeed s (new Seed);
+ s->deserialize(seed_data.data());
+
+ seeds.push_back(std::move(s));
+ }
+
+ return bytes;
+}
+
+
+
+// Create new seeds from a list of kmer positions and add to current seeds
+void ReadAlignment::add_new_seeds(GenomePosListType& pos) {
+
+ seeds.reserve(seeds.size() + pos.size());
+
+ for(GenomePosListIt it = pos.begin(); it != pos.end(); ++it) {
+
+ USeed s (new Seed);
+
+ s->gid = it->gid;
+ s->start_pos = it->pos - (cycle-K);
+ s->num_matches = 1;
+ s->cigar_data.clear();
+ if (cycle-K > 0)
+ s->cigar_data.emplace_back(cycle-K,NO_MATCH);
+ s->cigar_data.emplace_back(K,0);
+
+ seeds.push_back(std::move(s));
+
+ }
+
+}
+
+
+
+std::vector<std::chrono::high_resolution_clock::duration> ReadAlignment::extend_alignment(char bc, KixRun* index, AlignmentSettings* settings) {
+ // time stuff
+ std::chrono::high_resolution_clock::duration d_vec = std::chrono::high_resolution_clock::duration::zero();
+ std::chrono::high_resolution_clock::duration d_seed = std::chrono::high_resolution_clock::duration::zero();
+ std::chrono::high_resolution_clock::duration d_add = std::chrono::high_resolution_clock::duration::zero();
+ std::chrono::high_resolution_clock::duration d_rem = std::chrono::high_resolution_clock::duration::zero();
+ std::chrono::high_resolution_clock::duration d_sort = std::chrono::high_resolution_clock::duration::zero();
+
+ // q-gram-lemma: seeds added after last_new_seed have at least more than min_errors errors
+ CountType last_new_seed = K*(settings->min_errors+1);
+
+ // move to the next cycle
+ cycle += 1;
+
+ // update the last k-mer
+ uint8_t qual = ((bc >> 2) & 63); // get bits 3-8
+ if ( (bc == 0) || (qual < settings->min_qual) ){ // no call if all 0 bits or quality below threshold
+ last_kmer = 0;
+ last_invalid = cycle;
+ }
+ else{
+ // update the current k-mer of the read using the basecall
+ update_kmer(last_kmer, bc);
+ }
+
+ // update the alignments
+ if ( last_invalid+K-1 < cycle ) {
+
+ std::chrono::high_resolution_clock::time_point tv1 = std::chrono::high_resolution_clock::now();
+ // get all occurrences of last_kmer (fwd & rc) from index
+ GenomePosListType pos = index->retrieve_positions(last_kmer);
+
+ // pos MUST be sorted. However, pos is sorted as long as the index is sorted (should be by default)
+ if (settings->sort_positions) {
+ std::sort(pos.begin(),pos.end(),gp_compare);
+ }
+
+ std::chrono::high_resolution_clock::time_point tv2 = std::chrono::high_resolution_clock::now();
+ d_vec = tv2 - tv1;
+
+
+ std::chrono::high_resolution_clock::time_point tl1 = std::chrono::high_resolution_clock::now();
+
+ CountType max_num_matches = 0;
+
+ // check if the current k-mer was trimmed in the index
+ if ( (pos.size() == 1) && (pos[0].gid == TRIMMED) ) {
+ // clear the pos list such that nothing bad happens in the next steps
+ pos.clear();
+
+ // pretend that all existing seeds could be extended
+ for (auto sd = seeds.begin() ; sd!=seeds.end(); ++sd ) {
+ (*sd)->cigar_data.back().length += 1;
+ if ((*sd)->cigar_data.back().offset != NO_MATCH) {
+ (*sd)->num_matches += 1;
+ max_num_matches = std::max(max_num_matches, (*sd)->num_matches);
+ }
+ }
+ }
+ // not trimmed in the index --> try to extend existing seeds
+ else {
+ // find support for each candidate: iterate over seed candidates and positions simultaneously
+ auto cPos1 = pos.begin(); // beginning of the sliding window [cPos1, cPos2)
+ auto cPos2 = pos.begin(); // end of the sliding window
+ auto wSeed1 = seeds.begin(); // beginning of the sliding window in the seeds [wSeed1, wSeed2)
+ auto wSeed2 = seeds.begin(); // end of the sliding window
+
+ for (auto cSeed = seeds.begin(); cSeed!=seeds.end(); ++cSeed ) {
+ PositionType seed_pos = (*cSeed)->start_pos + cycle -K;
+
+ // adjust the window in the position list
+ while( (cPos1!=pos.end()) && (cPos1->pos < seed_pos - settings->window) ){
+ ++cPos1;
+ }
+ while( (cPos2!=pos.end()) && (cPos2->pos < seed_pos + settings->window) ){
+ ++cPos2;
+ }
+
+ // adjust the neighboring seeds window
+ while( (wSeed1!=seeds.end()) && ((*wSeed1)->start_pos < (*cSeed)->start_pos - 2*settings->window) ){
+ ++wSeed1;
+ }
+ while( (wSeed2!=seeds.end()) && ((*wSeed2)->start_pos < (*cSeed)->start_pos + 2*settings->window) ){
+ ++wSeed2;
+ }
+
+ // search all positions in the window for the best matching extension of the seed
+ DiffType best_distance = settings->window+1; // set larger than search window
+ GenomePosListIt best_match = cPos2; // set behind the last element of the window
+ for(GenomePosListIt win=cPos1; win!=cPos2; ++win){
+ if (win->gid == (*cSeed)->gid){
+ int dist = seed_pos - win->pos;
+ if ((best_match==cPos2)||(abs(dist) < abs(best_distance))) {
+ best_match = win;
+ best_distance = dist;
+ }
+ }
+ }
+
+ // assign best position to the seed
+ if (best_match != cPos2) {
+ // find the best seed from the perspective of best_match
+ DiffType best_sdist = 2*settings->window+1; // set larger than search window
+ auto best_seed = wSeed2; // set behind the last element of the window
+ for(auto win=wSeed1; win!=wSeed2; ++win){
+ if ((*win)->gid == best_match->gid){
+ int dist = best_match->pos - ((*win)->start_pos+cycle-K);
+ if ((best_seed==wSeed2)||(abs(dist) < abs(best_sdist))) {
+ best_seed = win;
+ best_sdist = dist;
+ }
+ }
+ }
+
+ if (best_seed == cSeed) {
+ //(*cSeed)->matches.push_back(best_distance);
+ (*cSeed)->num_matches += 1;
+ if ( (*cSeed)->cigar_data.back().offset == NO_MATCH ) {
+ // start a new match area. 1 matching k-mer = K matches
+ (*cSeed)->cigar_data.emplace_back(K,best_distance);
+ }
+ else {
+ // continue existing match area
+ (*cSeed)->cigar_data.back().length += 1;
+ }
+ max_num_matches = std::max(max_num_matches, (*cSeed)->num_matches);
+ // remove assigned position from the list
+ if(best_match == cPos1){
+ cPos1 = pos.erase(best_match);
+ }
+ else {
+ pos.erase(best_match);
+ }
+ --cPos2; //
+ }
+ else{
+ // best match has another favourite
+ if ( (*cSeed)->cigar_data.back().offset == NO_MATCH ) {
+ // continue existing mismatch area
+ (*cSeed)->cigar_data.back().length += 1;
+ }
+ else {
+ // start new mismatch area
+ (*cSeed)->cigar_data.emplace_back(1,NO_MATCH);
+ }
+ //(*cSeed)->matches.push_back(NO_MATCH);
+ }
+ }
+ else{
+ // no position found to extend the current seed
+ if ( (*cSeed)->cigar_data.back().offset == NO_MATCH ) {
+ // continue existing mismatch area
+ (*cSeed)->cigar_data.back().length += 1;
+ }
+ else {
+ // start new mismatch area
+ (*cSeed)->cigar_data.emplace_back(1,NO_MATCH);
+ }
+ //(*cSeed)->matches.push_back(NO_MATCH);
+ }
+ } // END: for(seeds...)
+ } // END: not trimmed
+ std::chrono::high_resolution_clock::time_point tl2 = std::chrono::high_resolution_clock::now();
+ d_seed = tl2 - tl1;
+
+
+ std::chrono::high_resolution_clock::time_point ta1 = std::chrono::high_resolution_clock::now();
+ // set the last_new_seed cycle according to the mapping mode
+ if ( settings->best_hit_mode ) {
+ last_new_seed = std::min(CountType(rlen-max_num_matches+1),last_new_seed);
+ }
+ if ( settings->best_n_mode ) {
+ //TODO: last_new_seed = std::min(CountType(rlen-max_num_matches+1),last_new_seed);
+ }
+
+ // create new seed candidates for each k-mer match that was not used to extend a seed
+ if ( cycle <= last_new_seed ) {
+
+ add_new_seeds(pos);
+
+ if (pos.begin() != pos.end()) {
+ max_num_matches = std::max(max_num_matches, (CountType)1);
+ }
+
+ }
+ std::chrono::high_resolution_clock::time_point ta2 = std::chrono::high_resolution_clock::now();
+ d_add = ta2 - ta1;
+
+
+ std::chrono::high_resolution_clock::time_point tr1 = std::chrono::high_resolution_clock::now();
+
+ // get the num_matches of the N'th best seed
+ // seed list gets partially sorted (and sorted back later). Noone ever said it will be fast...
+ CountType nth_best_match = 0;
+ if ( settings->best_n_mode && (settings->best_n > 0) && (seeds.size() >= settings->best_n) ) {
+ std::nth_element(seeds.begin(), seeds.begin()+seeds.size()-settings->best_n , seeds.end(), seed_compare_num_matches);
+ nth_best_match = (*(seeds.begin()+seeds.size()-settings->best_n))->num_matches;
+ }
+
+ // define a lambda function implementing all discard criteria.
+ // all criteria must be fulfilled to keep the seed. Go from stronger to weaker criteria.
+ auto crit = [&] (USeed & s) {
+ // don't filter seeds that were extended in this cycle
+ if (s->cigar_data.back().offset != NO_MATCH) {
+ return false;
+ }
+
+ // 1. remove one-hit-wonders
+ if ( settings->discard_ohw && (cycle>settings->start_ohw)&&(s->num_matches<=1) ) {
+ return true;
+ }
+
+ // 2. remove according to q-gram lemma
+ if ( cycle > (K*(settings->min_errors+1) + s->num_matches) ) {
+ return true;
+ }
+
+ // 3. remove according Best-Hit-criteria
+ if ( settings->best_hit_mode ) {
+ if (cycle > (rlen - max_num_matches + s->num_matches)) {
+ return true;
+ }
+ }
+ // 4. remove according Best-N-criteria
+ else if ( settings->best_n_mode ) {
+ if ( cycle > (rlen - nth_best_match + s->num_matches) ) {
+ return true;
+ }
+ }
+
+ // get the number of mismatches since the last match
+ int since_last_match = s->cigar_data.back().length;
+
+ // 5. heuristic criterium
+ if ((since_last_match >= K+10)&&(s->num_matches < (int)(std::sqrt(cycle-K+1)))){
+ return true;
+ }
+
+ return false;
+ };
+
+ seeds.erase(std::remove_if(seeds.begin(),seeds.end(),crit) , seeds.end());
+ std::chrono::high_resolution_clock::time_point tr2 = std::chrono::high_resolution_clock::now();
+ d_rem = tr2 - tr1;
+
+
+
+ std::chrono::high_resolution_clock::time_point tso1 = std::chrono::high_resolution_clock::now();
+ std::sort(seeds.begin(), seeds.end(), seed_compare_pos);
+ std::chrono::high_resolution_clock::time_point tso2 = std::chrono::high_resolution_clock::now();
+ d_sort = tso2 - tso1;
+
+ } // END: if ( last_invalid+K-1 < cycle ) ...
+ else {
+
+ // write a NO_MATCH if cycle > K-1
+ if ( cycle > K-1 ) {
+ for (auto sit = seeds.begin(); sit != seeds.end(); ++sit){
+ //(*sit)->matches.push_back(NO_MATCH);
+ if ( (*sit)->cigar_data.back().offset == NO_MATCH ) {
+ // continue existing mismatch area
+ (*sit)->cigar_data.back().length += 1;
+ }
+ else {
+ // start new mismatch area
+ (*sit)->cigar_data.emplace_back(1,NO_MATCH);
+ }
+ }
+ }
+
+ }
+
+ std::vector<std::chrono::high_resolution_clock::duration> dur;
+ dur.push_back(d_vec);
+ dur.push_back(d_seed);
+ dur.push_back(d_add);
+ dur.push_back(d_rem);
+ dur.push_back(d_sort);
+ return dur;
+}
+
+
+
+// disable this alignment, i.e. delete all seeds and set the last_invalid indicator to the
+// end of the read. --> This read will not be aligned and consumes almost no space.
+void ReadAlignment::disable() {
+ last_invalid = rlen;
+ seeds.clear();
+ flags = 0;
+}
+
+
+// generate the SAM flags for a seed
+uint32_t ReadAlignment::get_SAM_flags(uint32_t sd) {
+ if ( sd < seeds.size() ) {
+ uint32_t flags = 0;
+ // flag for the reverse strand alignment
+ if (seeds[sd]->start_pos < 0)
+ flags += 16;
+
+ // Primary/Secondary alignment
+ // the "Primary" alignment is the alignment with the highest number of matches
+ // in case of a tie, the left-most alignment wins (including negative positions)
+ bool primary = true;
+ for(uint32_t i = 0; i < seeds.size(); i++) {
+ if(seeds[i]->num_matches > seeds[sd]->num_matches || (seeds[i]->num_matches == seeds[sd]->num_matches && i < sd)) {
+ primary = false;
+ }
+ }
+ if (!primary)
+ flags += 256;
+ return flags;
+ }
+ else {
+ throw std::length_error(std::string("Error generating SAM flags: requested alignment ID (")+std::to_string(sd)+std::string(") exceeds number of alignments (")+std::to_string(seeds.size())+std::string(")!"));
+ }
+}
+
+
+// obtain start position of a seed according to SAM (leftmost)
+PositionType ReadAlignment::get_SAM_start_pos(uint32_t sd) {
+ PositionType pos = seeds[sd]->start_pos;
+ if (pos < 0) {
+ pos = -pos - rlen + K;
+ }
+ return pos;
+}
+
+
+// calculate a quality score according to SAM
+uint16_t ReadAlignment::get_SAM_quality(uint32_t sd) {
+ CountType best_match = 0;
+ for (uint32_t s = 0; s < seeds.size(); s++) {
+ best_match = std::max(seeds[s]->num_matches, best_match);
+ }
+ double total_weighted_matches = 0.;
+ for (uint32_t s = 0; s < seeds.size(); s++) {
+ total_weighted_matches = std::pow(4., double(seeds[s]->num_matches - best_match));
+ }
+
+ double prob = std::pow(4., double(seeds[sd]->num_matches - best_match)) / total_weighted_matches;
+ int score = -10 * std::log10(prob);
+ if (score > 255 || score < 0)
+ score = 255;
+ return score;
+}
+
+
+
+
+
+
+
+
+
+
+/* Calculate the mapping quality for all alignments of the read based on the other alignments
+ and the number of matching positions.
+ */
+
+int16_t MAPQ(const SeedVec &sv){
+ return sv.size();
+ }
+
+
+/* Helper function; checks if the position at 'sit' is sane. only applicable for reads matching
+ to the backwards genome sequence (forward sanity check is MUCH easier) */
+bool is_sane(std::vector<DiffType>::iterator sit,
+ const std::vector<DiffType> &matches){
+ DiffType dist = 0;
+ DiffType this_offset = *sit;
+ // find the OFFSET of the next matching region (after the next NOMATCH region)
+ bool passed_NOMATCH = false;
+ while(!((*sit!=NO_MATCH)&&passed_NOMATCH) && (sit!=matches.end())){
+ if(*sit == NO_MATCH) {passed_NOMATCH = true;}
+ ++sit;
+ ++dist;
+ }
+
+ if (sit == matches.end()) {
+ return true;
+ }
+ else {
+ int offset_change = *sit - this_offset;
+ return ((dist-K+1 - offset_change*(offset_change>0)) > 0);
+ }
+}
+
+/* Construct a CIGAR string from a seed */
+std::string CIGAR(const Seed &seed){
+ // Alignments are always reported in forward direction wrt the reference sequence --> take care
+ bool fw = (seed.start_pos >= 0); // Read was mapped to the forward sequence
+ CigarVector cig;
+ if (fw) {
+ cig = seed.cigar_data;
+ }
+ else {
+ // need to adjust the offsets --> find last offset != NO_MATCH
+ auto rit = seed.cigar_data.rbegin();
+ while((rit != seed.cigar_data.rend()) && (rit->offset == NO_MATCH)) {
+ ++rit;
+ }
+ int loffset;
+ if (rit != seed.cigar_data.rend())
+ loffset = rit->offset;
+ else
+ loffset = 0;
+ cig.reserve(seed.cigar_data.size());
+ // reverse direction of 'cigar_data' and adjust offsets
+ for(rit = seed.cigar_data.rbegin(); rit != seed.cigar_data.rend(); ++rit) {
+ if (rit->offset != NO_MATCH)
+ cig.emplace_back(rit->length,loffset - rit->offset);
+ else
+ cig.emplace_back(rit->length,NO_MATCH);
+ }
+ }
+
+ // Now, calculate the CIGAR string
+ std::string cigar;
+ if (cig.size() > 0) {
+ int previous_offset = 0; // the offset in the previous matching region
+ for (uint32_t i = 0; i < cig.size(); i++) {
+ // extend the cigar string accordingly
+ if ( cig[i].offset != NO_MATCH ) {
+ cigar.append(std::to_string(cig[i].length)+std::string("M"));
+ previous_offset = cig[i].offset;
+ }
+ else {
+ int offset_change = 0;
+ int mm_len = cig[i].length -K+1;
+ // correct the length of the mismatch region if there are insane positions
+ if ( i+1 < cig.size() ) {
+ offset_change = cig[i+1].offset - previous_offset;
+ if (offset_change > mm_len) {
+ // offset change cannot be larger than the mismatch area
+ cig[i].length += (offset_change - mm_len);
+ cig[i+1].length -= (offset_change - mm_len);
+ mm_len = cig[i].length -K+1;
+ }
+ }
+
+ if ( offset_change == 0 ) {
+ // Sequence mismatch(es)
+ if ( i > 0 && i < cig.size()-1 ) {
+ cigar.append(std::to_string(cig[i].length -K+1)+std::string("X"));
+ }
+ else {
+ // report as soft-clipped at the beginning/end of the read
+ cigar.append(std::to_string(cig[i].length)+std::string("S"));
+ }
+ }
+ else {
+ // Number of insertions
+ int insertions = cig[i].length -K+1;
+ // Number of deletions
+ int deletions = - (offset_change - insertions);
+ // append to cigar string
+ if ( insertions > 0 )
+ cigar.append(std::to_string(insertions)+std::string("I"));
+ if ( deletions > 0 )
+ cigar.append(std::to_string(deletions)+std::string("D"));
+ }
+ }
+ }
+ }
+
+ /*int count = K-1; // number of matches/mismatches in a row
+ bool previous_match = true; // was the previous k-mer a match? -> recognize changes
+ // change initial values if the read starts with a mismatch
+ if(*(matches.begin()) == NO_MATCH) {
+ count = K-1;
+ previous_match = false;
+ }
+ int previous_offset = 0; // the offset in the previous matching region
+ int pos = 0; // current position in the read
+ for(auto it = matches.begin(); it != matches.end(); ++it){
+ if ((*it) != NO_MATCH) {
+ // The k-mer could be matched
+ if (previous_match) {
+ // Check the sanity of this match, if found on backward seq
+ bool sane = true;
+ if (!fw){
+ sane = is_sane(it, matches);
+ }
+ if (sane) {
+ // Continue a match
+ count += 1;
+ previous_offset = (*it);
+ }
+ else {
+ // end the match here, treat this position as mismatch
+ //std::cout << "INSANE position!!!" << std::endl;
+ // start a new mismatch area here. Don't care about countinuing, this happens below
+ // Finish a match region
+ cigar.append(std::to_string(count)+std::string("M"));
+ // Start a new mismatch region
+ count = 1;
+ previous_match = false;
+ }
+ }
+ // the previous k-mer was not matched
+ else {
+ int offset_change = (*it) - previous_offset;
+ // check the sanity of the match
+ bool sane = ((count-K+1 - offset_change*(offset_change>0)) >= 0) && fw;
+ // add the criterium for backward matches here (as above)
+ sane = sane || (!fw && is_sane(it, matches));
+ if (sane) {
+ // Finish a mismatch region
+ if (offset_change == 0) {
+ if(count > K-1){
+ // Sequence mismatch(es)
+ cigar.append(std::to_string(count-K+1)+std::string("X"));
+ }
+ }
+ else {
+ // cure only on forward mapping reads
+ if (count > K-1){
+ // Insertion compared to reference
+ cigar.append(std::to_string(count-K+1)+std::string("I"));
+ }
+ if (count-offset_change > K-1){
+ // In this case, a string of length count-offset_change was deleted from reference
+ cigar.append(std::to_string(count-offset_change-K+1)+std::string("D"));
+ }
+ }
+ // Start a new match region. The first k-mer represents K matching positions
+ count = K;
+ previous_match = true;
+ previous_offset = (*it);
+ }
+ else {
+ // insane match --> treat as mismatch
+ count += 1;
+ previous_match = false;
+ }
+ }
+ }
+ else {
+ // The k-mer could NOT be matched
+ if (!previous_match) {
+ // Continue a mismatch
+ count += 1;
+ } else {
+ // Finish a match region
+ cigar.append(std::to_string(count)+std::string("M"));
+ // Start a new mismatch region
+ count = 1;
+ previous_match = false;
+ }
+ }
+ // increase the position counter
+ pos++;
+ }
+ // Complete the cigar string
+ if (previous_match) {
+ // Fill cigar string with matches
+ cigar.append(std::to_string(count)+std::string("M"));
+ }
+ else {
+ // Fill cigar string with mismatches
+ // TODO: probably report this as "soft-clipping"
+ cigar.append(std::to_string(count)+std::string("X"));
+ }
+ }*/
+ return cigar;
+}
diff --git a/lib/alnread.h b/lib/alnread.h
new file mode 100644
index 0000000..e16bede
--- /dev/null
+++ b/lib/alnread.h
@@ -0,0 +1,135 @@
+#ifndef ALNREAD_H
+#define ALNREAD_H
+
+#include <chrono>
+
+#include "headers.h"
+#include "definitions.h"
+#include "kindex.h"
+#include "tools.h"
+
+
+//-------------------------------------------------------------------//
+//------ The Seed data structure ----------------------------------//
+//-------------------------------------------------------------------//
+
+struct CigarElement {
+ CountType length;
+ DiffType offset;
+ CigarElement (CountType l, DiffType o): length(l), offset(o) {};
+ CigarElement (): length(0), offset(NO_MATCH) {};
+};
+
+
+typedef std::vector<CigarElement> CigarVector;
+
+// a Seed stores the alignment of a read to a target genome
+struct Seed {
+ // internal sequence ID of taget genome
+ GenomeIdType gid;
+ // (estimated) start position of the read on the target
+ PositionType start_pos;
+ // number of matches
+ CountType num_matches;
+
+ // list of all observed matches
+ //std::vector<DiffType> matches;
+ // Information about matches/mismatches (similar to CIGAR). The last element is the current one
+ CigarVector cigar_data;
+
+ // get the size of the serialized object
+ uint16_t serialize_size();
+
+ // serialize the object
+ std::vector<char> serialize();
+
+ // deserialize (read) data from a char vector
+ uint16_t deserialize(char* d);
+};
+
+typedef std::unique_ptr<Seed> USeed;
+
+// compare function to sort Seed objects by position
+bool seed_compare (Seed i,Seed j);
+bool seed_compare_pos (const USeed & i, const USeed & j);
+bool seed_compare_num_matches (const USeed & i, const USeed & j);
+
+// std::vector of Seed pointers is much faster
+typedef std::vector<USeed> SeedVec;
+// a SeedVec Iterator
+typedef SeedVec::iterator SeedVecIt;
+
+
+
+//-------------------------------------------------------------------//
+//------ The Read-Alignment class ---------------------------------//
+//-------------------------------------------------------------------//
+
+class ReadAlignment {
+ private:
+ // read length
+ const CountType rlen;
+
+ // Create new seeds from a list of kmer positions and add to current seeds
+ void add_new_seeds(GenomePosListType& pos);
+
+
+ public: // have everything public until the apropriate access functions are available
+
+ // Flags for this read; 1 = read is valid (illumina flag)
+ unsigned char flags;
+
+ // the k-mer value observed in the last cycle
+ HashIntoType last_kmer;
+
+ // the last invalid cycle
+ CountType last_invalid;
+
+ // the current cycle
+ CountType cycle;
+
+ // a list of all found seeds
+ SeedVec seeds;
+
+ // Create a new read alignment given a certain read length
+ ReadAlignment(CountType rl): rlen(rl), last_kmer(0), last_invalid(0), cycle(0) {seeds.clear();};
+
+ // get the size of the serialized object
+ uint64_t serialize_size();
+
+ // serialize the object
+ std::vector<char> serialize();
+
+ // deserialize (read) data from a char vector
+ uint64_t deserialize(char* d);
+
+ // extend the alignment by one basecall using reference database index
+ std::vector<std::chrono::high_resolution_clock::duration> extend_alignment(char bc, KixRun* index, AlignmentSettings* settings);
+
+ // disable this alignment
+ void disable();
+
+ // generate the SAM flags for a seed
+ uint32_t get_SAM_flags(uint32_t sd);
+
+ // obtain start position of a seed according to SAM (leftmost)
+ PositionType get_SAM_start_pos(uint32_t sd);
+
+ // calculate a quality score according to SAM
+ uint16_t get_SAM_quality(uint32_t sd);
+
+}; // END class ReadAlignment
+
+
+
+
+
+
+//------ Other helper functions -----------------------------------//
+//CountType num_matches(const std::vector<DiffType> &matches);
+int16_t MAPQ(const SeedVec &sv);
+std::string CIGAR(const Seed &seed);
+
+
+
+#endif /* ALNREAD_H */
diff --git a/lib/alnstream.cpp b/lib/alnstream.cpp
new file mode 100644
index 0000000..0f969ba
--- /dev/null
+++ b/lib/alnstream.cpp
@@ -0,0 +1,727 @@
+#include "alnstream.h"
+
+
+//-------------------------------------------------------------------//
+//------ The output Alignment Stream class ------------------------//
+//-------------------------------------------------------------------//
+
+// new output Alignment Stream class
+oAlnStream::oAlnStream(uint16_t ln, uint16_t tl, uint16_t cl, std::string rt, CountType rl, uint32_t nr, uint64_t bs, uint8_t fmt):
+ lane(ln), tile(tl), cycle(cl), root(rt), rlen(rl), num_reads(nr), num_written(0), buffer(bs,0), buf_size(bs), buf_pos(0), format(fmt), ofile(NULL), ozfile(Z_NULL) {}
+
+
+// write function for lz4 compression
+uint64_t oAlnStream::lz4write(const char* source, uint64_t size) {
+ // allocate buffer for the compressed data
+ std::vector<char> buf (LZ4_COMPRESSBOUND(size),0);
+
+ // compress the data
+ uint32_t compressed_size = LZ4_compress (source, buf.data(), size);
+ if (!compressed_size)
+ throw std::runtime_error("Error compressing data with LZ4.");
+
+ // write the block size
+ if ( !fwrite(&compressed_size, 1, sizeof(uint32_t), ofile) )
+ throw std::runtime_error("Error writing block size to file while compressing data with LZ4.");
+
+ // write the data chunk
+ if ( !fwrite(buf.data(), 1, compressed_size, ofile) )
+ throw std::runtime_error("Error writing data to file while compressing with LZ4.");
+
+ return size;
+}
+
+
+uint64_t oAlnStream::open(std::string fname) {
+ // open the new Alignment file
+ switch (format) {
+ case 0: case 2:
+ ofile = fopen(fname.c_str(), "wb");
+ if (!ofile) {
+ std::cerr << "Could not open file " << fname << " for writing." << std::endl;
+ return 0;
+ }
+ break;
+ case 1:
+ ozfile = gzopen(fname.c_str(), "wb1"); //Don't compress too much, not enough bang for the buck
+ if (ozfile == Z_NULL) {
+ std::cerr << "Could not open file " << fname << " for writing." << std::endl;
+ return 0;
+ }
+ break;
+ default:
+ throw std::invalid_argument("Output file format not recognized");
+ }
+
+ // write the header:
+
+ // calculate total size first
+ unsigned long int total_size = 0;
+
+ total_size += sizeof(uint16_t); // lane
+ total_size += sizeof(uint16_t); // tile
+ total_size += sizeof(CountType); // cycle
+
+ // root directory name
+ uint16_t root_size = root.size();
+ total_size += sizeof(uint16_t);
+ total_size += root_size;
+
+ // read length
+ total_size += sizeof(CountType);
+
+ // number of reads
+ total_size += sizeof(uint32_t);
+
+ // create the vector to store the data
+ std::vector<char> data (total_size);
+ char* d = data.data();
+
+ // write the lane
+ memcpy(d,&lane,sizeof(uint16_t));
+ d += sizeof(uint16_t);
+
+ // write the tile
+ memcpy(d,&tile,sizeof(uint16_t));
+ d += sizeof(uint16_t);
+
+ // write the cycle
+ memcpy(d,&cycle,sizeof(CountType));
+ d += sizeof(CountType);
+
+ // root directory name
+ memcpy(d,&root_size,sizeof(uint16_t));
+ d += sizeof(uint16_t);
+
+ memcpy(d,root.c_str(),root_size);
+ d += root_size;
+
+ // write the read length
+ memcpy(d,&rlen,sizeof(CountType));
+ d += sizeof(CountType);
+
+ // write the number of reads
+ memcpy(d,&num_reads,sizeof(uint32_t));
+ d += sizeof(int32_t);
+
+ // write all data
+ uint64_t written = 0;
+ switch (format) {
+ case 0: case 2: written = fwrite(data.data(), 1, data.size(), ofile); break;
+ case 1: written = gzwrite(ozfile, data.data(), data.size()); break;
+ }
+
+ return written;
+}
+
+
+// writes a read alignment to the output Alignment file.
+// Buffering is handled internally
+uint64_t oAlnStream::write_alignment(ReadAlignment & al) {
+ if ( (!ofile && (format == 0 || format == 2)) || (ozfile == Z_NULL && format == 1) ){
+ throw std::runtime_error("Could not write alignment to file. File handle not valid.");
+ }
+ if (num_written >= num_reads) {
+ throw std::length_error("Could not write alignment to file. All alignments were already written.");
+ }
+
+ std::vector<char> data = al.serialize();
+ uint64_t al_size = data.size();
+
+ // first, write the size of the serialized alignment (uint32_t = 4 bytes)
+ if (buf_pos+sizeof(uint32_t) <= buf_size) {
+ // directly copy if all 4 bytes have space in the buffer (should be almost always the case)
+ memcpy(buffer.data()+buf_pos,&al_size,sizeof(uint32_t));
+ buf_pos += sizeof(uint32_t);
+ }
+ else {
+ // copy the first bytes into temporary buffer to compose the alignment size
+ std::vector<char> temp (sizeof(uint32_t),0);
+ memcpy(temp.data(),&al_size,sizeof(uint32_t));
+
+ uint64_t first_part = buf_size-buf_pos;
+ memcpy(buffer.data()+buf_pos,temp.data(),first_part);
+
+ // write out buffer
+ uint64_t written = 0;
+ switch (format) {
+ case 0: written = fwrite(buffer.data(), 1, buffer.size(), ofile); break;
+ case 1: written = gzwrite(ozfile, buffer.data(), buffer.size()); break;
+ case 2: written = lz4write(buffer.data(), buffer.size()); break;
+ }
+ assert(written == buf_size);
+
+ // copy remaining data
+ memcpy(buffer.data(),temp.data()+first_part,sizeof(uint32_t)-first_part);
+ buf_pos = sizeof(uint32_t)-first_part;
+ }
+
+ // finally, write the serialized data
+ uint64_t copied = 0;
+ while (copied < al_size) {
+ uint64_t to_copy = std::min(al_size-copied,buf_size-buf_pos);
+ memcpy(buffer.data()+buf_pos, data.data()+copied, to_copy);
+ buf_pos += to_copy;
+ copied += to_copy;
+
+ // write buffer to disk if full
+ if(buf_pos >= buf_size){
+ uint64_t written = 0;
+ switch (format) {
+ case 0: written = fwrite(buffer.data(), 1, buffer.size(), ofile); break;
+ case 1: written = gzwrite(ozfile, buffer.data(), buffer.size()); break;
+ case 2: written = lz4write(buffer.data(), buffer.size()); break;
+ }
+ assert(written == buf_size);
+ buf_pos = 0;
+ }
+ }
+
+ num_written++;
+
+ return num_written;
+}
+
+
+bool oAlnStream::close() {
+ if ( ((format == 0 || format == 2) && ofile) || (format == 1 && ozfile != Z_NULL) ) {
+ // write remaining buffer content to file
+ uint64_t written = 0;
+ switch (format) {
+ case 0: written = fwrite(buffer.data(), 1, buf_pos, ofile); break;
+ case 1: written = gzwrite(ozfile, buffer.data(), buf_pos); break;
+ case 2: written = lz4write(buffer.data(), buf_pos); break;
+ }
+ assert(written == buf_pos);
+ buf_pos = 0;
+ if (num_written == num_reads) {
+ switch (format) {
+ case 0: case 2: fclose(ofile); break;
+ case 1: gzclose(ozfile); break;
+ }
+ return true;
+ }
+ else {
+ std::cerr << "Error: Could not close output alignment file! "<< num_reads - num_written <<" alignments missing." << std::endl;
+ return false;
+ }
+ }
+ else {
+ std::cerr << "Error: Could not close output alignment file! File handle not valid." << std::endl;
+ return false;
+ }
+}
+
+
+
+//-------------------------------------------------------------------//
+//------ The input Alignment Stream class -------------------------//
+//-------------------------------------------------------------------//
+
+// new Alignment Stream class
+iAlnStream::iAlnStream(uint64_t bs, uint8_t fmt):
+ lane(0), tile(0), cycle(0), root(""), rlen(0), num_reads(0), num_loaded(0), buffer(bs,0), buf_size(bs), buf_pos(bs), format(fmt), ifile(NULL), izfile(Z_NULL) {}
+
+
+// read function for lz4 decompression, reads one block of data
+uint64_t iAlnStream::lz4read_block() {
+ // get the size of the next block
+ uint32_t compressed_size = 0;
+ if ( !fread(&compressed_size,sizeof(uint32_t),1,ifile) )
+ return 0;
+
+ // allocate buffer for the compressed data
+ std::vector<char> cbuf (compressed_size,0);
+
+ // read the data
+ if ( !fread(cbuf.data(),compressed_size,1,ifile) )
+ throw std::runtime_error("Malformed input file. Could not read next block.");
+
+ // decompress the data
+ int64_t r_size = LZ4_decompress_safe (cbuf.data(), buffer.data(), compressed_size, buffer.size());
+ if ( r_size < 0 )
+ throw std::runtime_error("Error while decompressing LZ4 compressed block.");
+
+ // update the current buffer size
+ buf_size = r_size;
+
+ return (uint64_t)r_size;
+}
+
+
+uint64_t iAlnStream::open(std::string fname) {
+ // open the new Alignment file
+ switch (format) {
+ case 0: case 2:
+ ifile = fopen(fname.c_str(), "rb");
+ if (!ifile) {
+ std::cerr << "Error opening file " << fname << " for reading." << std::endl;
+ return 0;
+ }
+ break;
+ case 1:
+ izfile = gzopen(fname.c_str(), "rb");
+ if (izfile == Z_NULL) {
+ std::cerr << "Error opening file " << fname << " for reading." << std::endl;
+ return 0;
+ }
+ break;
+ default:
+ throw std::invalid_argument("Input file format not recognized.");
+ }
+
+ // load the header:
+
+ uint64_t bytes = 0;
+ uint16_t root_size;
+ switch (format) {
+ case 0: case 2:
+ {
+ // read the lane
+ bytes += fread(&lane,sizeof(uint16_t),1,ifile);
+ // read the tile
+ bytes += fread(&tile,sizeof(uint16_t),1,ifile);
+ // read the cycle
+ bytes += fread(&cycle,sizeof(CountType),1,ifile);
+ // root directory name size
+ bytes += fread(&root_size,sizeof(uint16_t),1,ifile);
+ // root name
+ char tmp[root_size+1];
+ bytes += fread(tmp,1,root_size,ifile);
+ tmp[root_size] = 0; // make the string null-terminated
+ root = tmp;
+ // read the read length
+ bytes += fread(&rlen,sizeof(CountType),1,ifile);
+ // read the number of reads
+ bytes += fread(&num_reads,sizeof(uint32_t),1,ifile);
+ break;
+ }
+ case 1:
+ {
+ // read the lane
+ bytes += gzread(izfile,&lane,sizeof(uint16_t));
+ // read the tile
+ bytes += gzread(izfile,&tile,sizeof(uint16_t));
+ // read the cycle
+ bytes += gzread(izfile,&cycle,sizeof(CountType));
+ // root directory name size
+ bytes += gzread(izfile,&root_size,sizeof(uint16_t));
+ // root name
+ char tmp[root_size+1];
+ bytes += gzread(izfile,tmp,root_size);
+ tmp[root_size] = 0; // make the string null-terminated
+ root = tmp;
+ // read the read length
+ bytes += gzread(izfile,&rlen,sizeof(CountType));
+ // read the number of reads
+ bytes += gzread(izfile,&num_reads,sizeof(uint32_t));
+ break;
+ }
+ }
+
+ return bytes;
+}
+
+
+
+ReadAlignment iAlnStream::get_alignment() {
+
+ if ( (format==0 && !ifile) || (format==1 && izfile == Z_NULL) ){
+ throw std::runtime_error("Could not load alignment from file. File handle not valid.");
+ }
+ if (num_loaded >= num_reads) {
+ throw std::length_error("Could not load alignment from file. All alignments were already loaded.");
+ }
+
+ // first, get the size of the serialized alignment (uint32_t = 4 bytes)
+ uint32_t al_size = 0;
+ if (buf_pos+sizeof(uint32_t) <= buf_size) {
+ // directly copy if all 4 bytes are in the buffer (should be almost always the case)
+ memcpy(&al_size,buffer.data()+buf_pos,sizeof(uint32_t));
+ buf_pos += sizeof(uint32_t);
+ }
+ else {
+ // copy the first bytes into temporary buffer to compose the alignment size
+ std::vector<char> temp (sizeof(uint32_t),0);
+ uint64_t first_part = buf_size-buf_pos;
+ memcpy(temp.data(),buffer.data()+buf_pos,first_part);
+
+ // load new buffer
+ uint64_t loaded;
+ switch (format) {
+ case 0:
+ loaded = fread(buffer.data(),1,buf_size,ifile);
+ assert( (loaded == buf_size) || feof(ifile) );
+ break;
+ case 1:
+ loaded = gzread(izfile,buffer.data(),buf_size);
+ assert( (loaded == buf_size) || gzeof(izfile) );
+ break;
+ case 2:
+ loaded = lz4read_block();
+ assert( loaded>0 || feof(ifile) );
+ break;
+ }
+
+ // copy remaining data and copy to variable
+ memcpy(temp.data()+first_part,buffer.data(),sizeof(uint32_t)-first_part);
+ buf_pos = sizeof(uint32_t)-first_part;
+ memcpy(&al_size,temp.data(),sizeof(uint32_t));
+ }
+
+ // then, copy the content to the data vector
+ std::vector<char> data(al_size,0);
+ uint64_t copied = 0;
+ while (copied < al_size) {
+ uint64_t to_copy = std::min(al_size-copied,buf_size-buf_pos);
+ memcpy(data.data()+copied, buffer.data()+buf_pos, to_copy);
+ buf_pos += to_copy;
+ copied += to_copy;
+
+ // read new buffer from disk if necessary
+ if(buf_pos >= buf_size){
+ uint64_t loaded;
+ switch (format) {
+ case 0:
+ loaded = fread(buffer.data(),1,buf_size,ifile);
+ assert( (loaded == buf_size) || feof(ifile) );
+ break;
+ case 1:
+ loaded = gzread(izfile,buffer.data(),buf_size);
+ assert( (loaded == buf_size) || gzeof(izfile) );
+ break;
+ case 2:
+ loaded = lz4read_block();
+ assert( loaded>0 || feof(ifile) );
+ break;
+ }
+ buf_pos = 0;
+ }
+ }
+
+ // finally, deserialize the alignment
+ ReadAlignment ra (rlen);
+ ra.deserialize(data.data());
+
+ num_loaded++;
+
+ return ra;
+}
+
+
+bool iAlnStream::close() {
+ //if (ifile) {
+ if ( ((format==0 || format==2) && ifile) || (format==1 && izfile != Z_NULL)) {
+ if (num_loaded == num_reads) {
+ switch (format) {
+ case 0: case 2: fclose(ifile); break;
+ case 1: gzclose(izfile); break;
+ }
+ return true;
+ }
+ else {
+ std::cerr << "Error: Could not close alignment file! "<< num_reads - num_loaded <<" alignments missing." << std::endl;
+ return false;
+ }
+ }
+ else {
+ throw std::runtime_error("Could not close alignment file. File handle not valid.");
+ }
+}
+
+
+
+
+//-------------------------------------------------------------------//
+//------ The StreamedAlignment class ------------------------------//
+//-------------------------------------------------------------------//
+
+std::string StreamedAlignment::get_bcl_file(uint16_t cycle) {
+ std::ostringstream path_stream;
+
+ path_stream << root << "/L00" << lane << "/C" << cycle << ".1/s_"<< lane <<"_" << tile << ".bcl";
+ return path_stream.str();
+}
+
+
+// get the path to the alignment file. The alignment file is located in
+// <base>/L00<lane>/s_<lane>_<tile>.<cycle>.align
+// if base == "": base = root
+std::string StreamedAlignment::get_alignment_file(uint16_t cycle, std::string base){
+ if (base == "") {
+ base = root;
+ }
+ std::ostringstream path_stream;
+ path_stream << base << "/L00" << lane << "/s_"<< lane << "_" << tile << "." << cycle << ".align";
+ return path_stream.str();
+}
+
+
+std::string StreamedAlignment::get_filter_file() {
+ std::ostringstream path_stream;
+ path_stream << root << "/L00" << lane << "/s_"<< lane << "_" << tile << ".filter";
+ return path_stream.str();
+}
+
+
+// create directories required to store the alignment files (only if not stored in root)
+void StreamedAlignment::create_directories(AlignmentSettings* settings) {
+ std::ostringstream path_stream;
+ if (settings->temp_dir == "") {
+ path_stream << root;
+ }
+ else {
+ path_stream << settings->temp_dir;
+ }
+ path_stream << "/L00" << lane;
+
+ boost::filesystem::create_directories(path_stream.str());
+
+ if (settings->sam_dir != "") {
+ std::ostringstream sam_stream;
+ sam_stream << settings->sam_dir;
+ sam_stream << "/L00" << lane;
+
+ boost::filesystem::create_directories(sam_stream.str());
+ }
+}
+
+
+// initialize empty alignment. Creates alignment files for a virtual Cycle 0
+void StreamedAlignment::init_alignment(AlignmentSettings* settings) {
+ std::string out_fname = get_alignment_file(0, settings->temp_dir);
+
+ // get the number of reads in this tile by looking in the first bcl file
+ std::string first_cycle = get_bcl_file(1);
+
+ // extract the number of reads
+ uint32_t num_reads = num_reads_from_bcl(first_cycle);
+
+ // open output alignment stream
+ oAlnStream output (lane, tile, 0, root, rlen, num_reads, settings->block_size, settings->compression_format);
+ output.open(out_fname);
+
+ // write empty read alignments for each read
+ for (uint32_t i = 0; i < num_reads; ++i) {
+ ReadAlignment ra (rlen);
+ output.write_alignment(ra);
+ }
+
+ if(!output.close()) {
+ std::cerr << "Error: Could not create initial alignment file." << std::endl;
+ }
+}
+
+
+
+// extend an existing alignment from cycle <cycle-1> to <cycle>. returns the number of seeds
+uint64_t StreamedAlignment::extend_alignment(uint16_t cycle, KixRun* index, AlignmentSettings* settings) {
+
+ // 1. Open the input file
+ //-----------------------
+ std::string in_fname = get_alignment_file(cycle-1, settings->temp_dir);
+ std::string out_fname = get_alignment_file(cycle, settings->temp_dir);
+ std::string bcl_fname = get_bcl_file(cycle);
+ std::string filter_fname = get_filter_file();
+
+ iAlnStream input ( settings->block_size, settings->compression_format );
+ input.open(in_fname);
+ assert(input.get_cycle() == cycle-1);
+ assert(input.get_lane() == lane);
+ assert(input.get_tile() == tile);
+ assert(input.get_root() == root);
+ assert(input.get_rlen() == rlen);
+
+ uint32_t num_reads = input.get_num_reads();
+
+
+ // 2. Open output stream
+ //----------------------------------------------------------
+ oAlnStream output (lane, tile, cycle, root, rlen, num_reads, settings->block_size, settings->compression_format);
+ output.open(out_fname);
+
+
+
+ // 3. Read the full BCL file (this is not too much)
+ //-------------------------------------------------
+ BclParser basecalls;
+ basecalls.open(bcl_fname);
+
+ // extract the number of reads from the BCL file
+ uint32_t num_base_calls = basecalls.size();
+ assert(num_base_calls == num_reads);
+
+
+ // 4. Load the filter flags if filter file is available
+ // ----------------------------------------------------
+ FilterParser filters;
+ if (file_exists(filter_fname)) {
+ filters.open(filter_fname);
+ // extract the number of reads from the filter file
+ uint32_t num_reads_filter = filters.size();
+
+ if (num_reads != num_reads_filter){
+ std::string msg = std::string("Number of reads in filter file (") + std::to_string(num_reads_filter) + ") does not match the number of reads in the BCL file (" + std::to_string(num_reads) + ").";
+ throw std::length_error(msg.c_str());
+ }
+ }
+
+ // 5. Extend alignments 1 by 1
+ //-------------------------------------------------
+ uint64_t num_seeds = 0;
+ for (uint64_t i = 0; i < num_reads; ++i) {
+ ReadAlignment ra = input.get_alignment();
+ if (filters.size() > 0 && filters.has_next()) {
+ // filter file was found -> apply filter
+ if(filters.next()) {
+ ra.extend_alignment(basecalls.next(), index, settings);
+ num_seeds += ra.seeds.size();
+ }
+ else {
+ basecalls.next();
+ ra.disable();
+ }
+ }
+ // filter file was not found -> treat every alignment as valid
+ else {
+ ra.extend_alignment(basecalls.next(), index, settings);
+ num_seeds += ra.seeds.size();
+ }
+
+ output.write_alignment(ra);
+ }
+
+ // 6. Close files
+ //-------------------------------------------------
+ if (!(input.close() && output.close())) {
+ std::cerr << "Could not finish alignment!" << std::endl;
+ }
+
+ // 7. Delete old alignment file, if requested
+ //-------------------------------------------
+ if ( ! settings->keep_aln_files ) {
+ std::remove(in_fname.c_str());
+ }
+
+ return num_seeds;
+}
+
+
+
+
+
+//-------------------------------------------------------------------//
+//------ Streamed SAM generation -----------------------------------//
+//-------------------------------------------------------------------//
+
+uint64_t alignments_to_sam(uint16_t ln, uint16_t tl, std::string rt, CountType rl, KixRun* index, AlignmentSettings* settings) {
+ // set the file names
+ std::string temp;
+ if (settings->temp_dir == "") {
+ temp = rt;
+ }
+ else {
+ temp = settings->temp_dir;
+ }
+ std::string sam_dir;
+ if (settings->sam_dir == "") {
+ sam_dir = temp;
+ }
+ else {
+ sam_dir = settings->sam_dir;
+ }
+ std::string filter_fname = filter_name(rt, ln, tl);
+ std::string position_fname = position_name(rt, ln, tl);
+ std::string alignment_fname = alignment_name(temp, ln, tl, rl);
+ std::string sam_fname = sam_tile_name(sam_dir, ln, tl);
+
+ // check if files exist
+ if ( !file_exists(alignment_fname) ) {
+ throw std::runtime_error(std::string("Could not create SAM file. Alignment file not found: ")+ alignment_fname);
+ }
+ if ( !file_exists(filter_fname) ) {
+ std::cerr << "Could not find .filter file: " << filter_fname << std::endl;
+ }
+
+
+ // open the alignment file
+ iAlnStream input ( settings->block_size, settings->compression_format );
+ input.open(alignment_fname);
+ uint64_t num_reads = input.get_num_reads();
+
+ // open the filter file, if applicable
+ FilterParser filters;
+ if (file_exists(filter_fname)) {
+ filters.open(filter_fname);
+ // extract the number of reads from the filter file
+ uint32_t num_reads_filter = filters.size();
+
+ if (num_reads != num_reads_filter){
+ std::string msg = std::string("Number of reads in filter file (") + std::to_string(num_reads_filter) + ") does not match the number of reads in the BCL file (" + std::to_string(num_reads) + ").";
+ throw std::length_error(msg.c_str());
+ }
+ }
+
+
+ // open the positions file, if applicable
+
+
+ // open the SAM file and write header
+ std::ofstream samfile;
+ samfile.open(sam_fname);
+ samfile << index->get_SAM_header();
+
+ uint64_t num_alignments = 0;
+
+ // read alignments and write to SAM
+ for (uint64_t i = 0; i < num_reads; ++i) {
+ ReadAlignment ra = input.get_alignment();
+ // if either: filter file is open and all filter flags were loaded and the filter flag is > 0
+ // or: the filter file is not available
+ if ((filters.size()>0 && filters.next()) || filters.size() == 0) {
+ num_alignments += ra.seeds.size();
+ std::stringstream readname;
+ // Read name format <instrument‐name>:<run ID>:<flowcell ID>:<lane‐number>:<tile‐number>:<x‐pos>:<y‐pos>
+ //readname << "<instrument>:<run-ID>:<flowcell-ID>:" << ln << ":" << tl << ":<xpos>:<ypos>:" << i;
+ readname << "lane." << ln << "|tile." << tl << "|read." << i;
+ for (uint64_t s = 0; s < ra.seeds.size(); s++) {
+ std::stringstream aln;
+ // write the SAM compliant alignment information
+ // Read name
+ aln << readname.str() << "\t";
+ // Flag
+ aln << ra.get_SAM_flags(s) << "\t";
+ // Reference name
+ aln << index->get_name(ra.seeds[s]->gid) << "\t";
+ // Position
+ aln << ra.get_SAM_start_pos(s) << "\t";
+ // Mapping Quality
+ aln << ra.get_SAM_quality(s) << "\t";
+ // CIGAR string
+ Seed sd = *(ra.seeds[s]);
+ aln << CIGAR(sd) << "\t";
+ // RNEXT field is unavailable
+ aln << "*\t";
+ // PNEXT field is unavailable
+ aln << "0\t";
+ // TLEN field is unavailable
+ aln << "0\t";
+ // Sequence is only added on request
+ aln << "*\t";
+ // Qualities are only added on request
+ aln << "*\n";
+ samfile << aln.str();
+ }
+ }
+ }
+ samfile.close();
+
+ std::ofstream statsfile;
+ statsfile.open(sam_fname+".stats");
+ statsfile << "Number of reads\t" << num_reads << std::endl;
+ statsfile << "Number of alignments\t" << num_alignments << std::endl;
+ statsfile.close();
+
+ return 0;
+}
+
diff --git a/lib/alnstream.h b/lib/alnstream.h
new file mode 100644
index 0000000..5afd108
--- /dev/null
+++ b/lib/alnstream.h
@@ -0,0 +1,177 @@
+#ifndef ALNSTREAM_H
+#define ALNSTREAM_H
+
+#include "headers.h"
+#include "definitions.h"
+#include "kindex.h"
+#include "tools.h"
+#include "alnread.h"
+#include "illumina_parsers.h"
+
+// Output alignment stream: write alignments to file one by one
+class oAlnStream {
+ // dataset information for the header
+ uint16_t lane;
+ uint16_t tile;
+ uint16_t cycle;
+ std::string root;
+ CountType rlen;
+ uint32_t num_reads;
+
+ // number of reads written to file
+ uint32_t num_written;
+
+ // data buffer (don't write everything at once)
+ std::vector<char> buffer;
+
+ // size of the buffer
+ uint64_t buf_size;
+
+ // current position in the buffer
+ uint64_t buf_pos;
+
+ // output file format
+ // 0: no compression
+ // 1: zlib compression (level 1)
+ // 11: lz4 compression (level 1)
+ uint8_t format;
+
+ // file handles
+ FILE* ofile;
+ gzFile ozfile;
+
+ // write function for lz4 compression
+ uint64_t lz4write(const char* buf, uint64_t size);
+
+ public:
+ // constructor initializes all member variables
+ oAlnStream(uint16_t ln, uint16_t tl, uint16_t cl, std::string rt, CountType rl, uint32_t nr, uint64_t bs, uint8_t fmt);
+
+ // open Alignment stream file and write header
+ uint64_t open(std::string fname);
+
+ // writes a read alignment to the output Alignment file.
+ // Buffering is handled internally
+ uint64_t write_alignment(ReadAlignment & al);
+
+ // checks if the correct number of alignments was written and closes the Alignment file
+ bool close();
+};
+
+
+
+// Input alignment stream: loads read alignments from a file one by one
+class iAlnStream {
+ // dataset information for the header
+ uint16_t lane;
+ uint16_t tile;
+ uint16_t cycle;
+ std::string root;
+ CountType rlen;
+ uint32_t num_reads;
+
+ // number of reads loaded from file
+ uint32_t num_loaded;
+
+ // data buffer (read blocks of data)
+ std::vector<char> buffer;
+
+ // size of the buffer
+ uint64_t buf_size;
+
+ // current position in the buffer
+ uint64_t buf_pos;
+
+ // output file format
+ // 0: no compression
+ // 1: zlib compression (level 1)
+ // 11: lz4 compression (level 1)
+ uint8_t format;
+
+ // file pointer
+ FILE* ifile;
+ gzFile izfile;
+
+ // read function for LZ4 compression. Reads one block of data to buffer
+ uint64_t lz4read_block();
+
+ public:
+ // constructor initializes only block size and file format
+ iAlnStream(uint64_t bs, uint8_t fmt);
+
+ // open Alignment stream file and load header
+ uint64_t open(std::string fname);
+
+ // loads a read alignment from the input Alignment file.
+ // Buffering is handled internally
+ ReadAlignment get_alignment();
+
+ // checks if the correct number of alignments was loaded and closes the Alignment file
+ bool close();
+
+ // get dataset information
+ inline uint16_t get_lane() {return lane;};
+ inline uint16_t get_tile() {return tile;};
+ inline uint16_t get_cycle() {return cycle;};
+ inline std::string get_root() {return root;};
+ inline CountType get_rlen() {return rlen;};
+ inline uint32_t get_num_reads() {return num_reads;};
+ inline uint32_t get_num_loaded() {return num_loaded;};
+};
+
+
+//-------------------------------------------------------------------//
+//------ The StreamedAlignment class ------------------------------//
+//-------------------------------------------------------------------//
+
+class StreamedAlignment {
+
+ // dataset information
+ uint16_t lane;
+ uint16_t tile;
+ std::string root; // the BaseCalls directory
+ CountType rlen;
+
+ // fetch the next read from the input stream
+ ReadAlignment get_next_read();
+
+ // write an alignment to the output stream
+ uint64_t write_alignment(ReadAlignment& ral);
+
+ // get the path to the bcl file of a given cycle
+ std::string get_bcl_file(uint16_t cycle);
+
+ // get the path to the alignment file. The alignment file is located in
+ // <base>/L00<lane>/s_<lane>_<tile>.<cycle>.align
+ // if base == "": base = root
+ std::string get_alignment_file(uint16_t cycle, std::string base = "");
+
+ // get the path to the filter file. The illumina filter information is located in
+ // <root>/L00<lane>/s_<lane>_<tile>.filter
+ std::string get_filter_file();
+
+ public:
+ StreamedAlignment(uint16_t ln, uint16_t tl, std::string rt, CountType rl): lane(ln), tile(tl), root(rt), rlen(rl) {};
+
+ // create directories required to store the alignment files (only if not stored in root)
+ void create_directories(AlignmentSettings* settings);
+
+ // initialize empty alignment. Creates files for a virtual Cycle 0
+ void init_alignment(AlignmentSettings* settings);
+
+ // extend an existing alignment from cycle <cycle-1> to <cycle>
+ uint64_t extend_alignment(uint16_t cycle, KixRun* index, AlignmentSettings* settings);
+
+}; /* END class StreamedAlignment */
+
+
+
+
+//-------------------------------------------------------------------//
+//------ Streamed SAM generation -----------------------------------//
+//-------------------------------------------------------------------//
+
+uint64_t alignments_to_sam(uint16_t ln, uint16_t tl, std::string rt, CountType rl, KixRun* index, AlignmentSettings* settings);
+
+
+#endif /* ALNSTREAM_H */
diff --git a/lib/config.h.in b/lib/config.h.in
new file mode 100644
index 0000000..1694736
--- /dev/null
+++ b/lib/config.h.in
@@ -0,0 +1,7 @@
+// the k-mer length
+#define K @HiLive_K@
+
+// the HiLive Version Number
+#define HiLive_VERSION_MAJOR @HiLive_VERSION_MAJOR@
+#define HiLive_VERSION_MINOR @HiLive_VERSION_MINOR@
+
diff --git a/lib/definitions.h b/lib/definitions.h
new file mode 100644
index 0000000..7080fe8
--- /dev/null
+++ b/lib/definitions.h
@@ -0,0 +1,157 @@
+#ifndef DEFINITIONS_H
+#define DEFINITIONS_H
+
+#include "headers.h"
+
+
+
+// bit representation of A/C/G/T.
+#define twobit_repr(ch) ((toupper(ch)) == 'A' ? 0LL : \
+ (toupper(ch)) == 'C' ? 1LL : \
+ (toupper(ch)) == 'G' ? 2LL : 3LL)
+
+// complement bit representation of A/C/G/T.
+#define twobit_comp(ch) ((toupper(ch)) == 'A' ? 3LL : \
+ (toupper(ch)) == 'C' ? 2LL : \
+ (toupper(ch)) == 'G' ? 1LL : 0LL)
+
+// bit representation to character
+#define revtwobit_repr(n) ((n) == 0 ? 'A' : \
+ (n) == 1 ? 'C' : \
+ (n) == 2 ? 'G' : 'T')
+
+
+// Allowed characters in sequence
+const std::string seq_chars = "ACGT";
+
+// largest number we're going to hash into. (8 bytes/64 bits/32 nt)
+// probably 32 bit/16 nt are enough here
+typedef uint64_t HashIntoType;
+
+// construct a mask to truncate a binary representation of a k-mer to length K
+const HashIntoType MASK = HashIntoType(pow(4,K))-1;
+
+
+// identifiers for genome sequences
+typedef uint16_t GenomeIdType;
+const GenomeIdType TRIMMED = std::numeric_limits<GenomeIdType>::max();
+
+// list of genome identifiers
+typedef std::vector<GenomeIdType> GenomeIdListType;
+
+// list of strings
+typedef std::vector<std::string> StringListType;
+
+
+// position in a genome
+typedef int32_t PositionType;
+
+// pair of genome ID and position
+struct GenomePosType {
+ GenomeIdType gid;
+ PositionType pos;
+
+ GenomePosType()=default;
+ GenomePosType(GenomeIdType g, PositionType p): gid(g), pos(p) {};
+};
+//typedef std::tuple<GenomeIdType,PositionType> GenomePosType;
+
+// size of a pair of genome ID and position
+const uint64_t GenomePos_size = sizeof(GenomeIdType) + sizeof(PositionType);
+
+// compare function to sort GenomePosType objects by position
+bool gp_compare (GenomePosType i,GenomePosType j);
+
+// vector of ID:position pairs
+typedef std::vector<GenomePosType> GenomePosListType;
+
+// iterator on GenomePosList
+typedef GenomePosListType::iterator GenomePosListIt;
+
+
+// the k-mer index array
+const HashIntoType n_kmer = pow(4,K);
+typedef std::array<GenomePosListType,n_kmer> KmerIndexType;
+
+
+// small counters
+typedef uint16_t CountType;
+
+
+// difference between k-mer position in the read and matching position in the reference
+typedef int16_t DiffType;
+
+// define a mismatch as max(DiffType)
+const DiffType NO_MATCH = std::numeric_limits<DiffType>::max();
+
+// define a trimmed position as max(DiffType)-1
+const DiffType TRIMMED_MATCH = std::numeric_limits<DiffType>::max()-1;
+
+
+
+// all user parameters are stored in the alignment settings
+struct AlignmentSettings {
+ // PARAMETER: Base Call quality cutoff, treat BC with quality < bc_cutoff as miscall
+ uint8_t min_qual = 1;
+
+ // PARAMETER: max. insert/deletion size
+ DiffType window = 50;
+
+ // PARAMETER: minimum number of errors allowed in alignment
+ CountType min_errors = 6;
+
+ // SWITCH: discard One-hit-wonders
+ bool discard_ohw = true;
+
+ // PARAMETER: first cycle to discard one-hit-wonders
+ CountType start_ohw = K+5;
+
+ // SWITCH: Best-Hit-Mode
+ bool best_hit_mode = true;
+
+ // SWITCH: Best-N-Mode
+ bool best_n_mode = false;
+
+ // PARAMETER: Best-N-Mode::N
+ CountType best_n = 1;
+
+ // SWITCH: sort positions found in index. Saves you a few seconds when turned off, but messes everything up when index is not sorted.
+ bool sort_positions = false;
+
+ // PARAMETER: temporary directory for the streamed alignment
+ std::string temp_dir = "";
+
+ // PARAMETER: directory for SAM file output
+ std::string sam_dir = "";
+
+ // SWITCH: Keep the old alignment files of previous cycles
+ bool keep_aln_files = true;
+
+ // PARAMETER: Memory block size for the input and output buffer in the streamed alignment
+ uint64_t block_size = 64*1024*1024; /* 64 MB */
+
+ // PARAMETER: Compression format for alignment files
+ uint8_t compression_format = 0;
+
+ // initialize with default values
+ AlignmentSettings() : min_qual(1),
+ window(5),
+ min_errors(2),
+ discard_ohw(true),
+ start_ohw(K+5),
+ best_hit_mode(true),
+ best_n_mode(false),
+ best_n(1),
+ sort_positions(false),
+ temp_dir(""),
+ sam_dir(""),
+ keep_aln_files(true),
+ block_size(64*1024*1024),
+ compression_format(0) {};
+
+};
+
+
+
+
+#endif /* DEFINITIONS_H */
diff --git a/lib/headers.h b/lib/headers.h
new file mode 100644
index 0000000..1869b16
--- /dev/null
+++ b/lib/headers.h
@@ -0,0 +1,24 @@
+#include "config.h"
+#include <iostream>
+#include <fstream>
+#include <zlib.h>
+#include <lz4.h>
+#include <vector>
+#include <array>
+#include <queue>
+#include <string>
+#include <cstring>
+#include <math.h>
+#include <assert.h>
+#include <tuple>
+#include <algorithm>
+#include <list>
+#include <limits>
+#include <cstdint>
+#include <sstream>
+#include <memory>
+#include <thread>
+#include <mutex>
+#include <chrono>
+#include <stdexcept>
+#include <boost/filesystem.hpp>
diff --git a/lib/illumina_parsers.cpp b/lib/illumina_parsers.cpp
new file mode 100644
index 0000000..fafbf84
--- /dev/null
+++ b/lib/illumina_parsers.cpp
@@ -0,0 +1,70 @@
+#include "illumina_parsers.h"
+
+
+// Constructor takes filename and directly loads the whole file
+uint64_t BclParser::open (std::string fname) {
+ // read the whole file as a chunk
+ data = read_binary_file(fname);
+ // extract the number of reads
+ memcpy(&num_reads,data.data(),4);
+ // set the position pointer to the beginning of the data block
+ position = 4;
+
+ return data.size();
+}
+
+// Get the next base call
+char BclParser::next() {
+ if ( position < data.size() ) {
+ position++;
+ return *(data.data()+position-1);
+ }
+ else {
+ throw std::runtime_error("Error reading BCL file: requested position is beyond EOF." );
+ }
+}
+
+// Check if there are base calls left
+bool BclParser::has_next() {
+ return (position < data.size());
+}
+
+// Returns the total number of base calls in the file
+uint32_t BclParser::size() {
+ return num_reads;
+}
+
+
+
+// Constructor takes filename and directly loads the whole file
+uint64_t FilterParser::open (std::string fname) {
+ // read the whole file as a chunk
+ data = read_binary_file(fname);
+ // extract the number of reads
+ memcpy(&num_reads,data.data()+8,4);
+ // set the position pointer to the beginning of the data block
+ position = 12;
+
+ return data.size();
+}
+
+// Get the next filter flag
+bool FilterParser::next() {
+ if ( position < data.size() ) {
+ position++;
+ return (*(data.data()+position-1) > 0);
+ }
+ else {
+ throw std::runtime_error("Error reading filter file: requested position is beyond EOF." );
+ }
+}
+
+// Check if there are filter flags left
+bool FilterParser::has_next() {
+ return (position < data.size());
+}
+
+// Returns the total number of filter flags in the file
+uint32_t FilterParser::size() {
+ return num_reads;
+}
diff --git a/lib/illumina_parsers.h b/lib/illumina_parsers.h
new file mode 100644
index 0000000..a875252
--- /dev/null
+++ b/lib/illumina_parsers.h
@@ -0,0 +1,59 @@
+#ifndef ILLUMINA_PARSERS_H
+#define ILLUMINA_PARSERS_H
+
+#include "headers.h"
+#include "definitions.h"
+#include "tools.h"
+
+// BCL file parser
+class BclParser {
+ // storage for the raw binary data
+ std::vector<char> data;
+ // current position in data
+ uint32_t position;
+ // number of reads in this bcl file
+ uint32_t num_reads;
+ public:
+ // open file and directly load all data
+ uint64_t open(std::string fname);
+
+ // Get the next base call
+ char next();
+
+ // Check if there are base calls left
+ bool has_next();
+
+ // Returns the total number of base calls in the file
+ uint32_t size();
+};
+
+
+// filter file parser
+class FilterParser {
+ // storage for the raw binary data
+ std::vector<char> data;
+ // current position in data
+ uint32_t position;
+ // number of reads in this filter file
+ uint32_t num_reads;
+ public:
+ // open file and directly load all data
+ uint64_t open(std::string fname);
+
+ // Get the next filter flag
+ bool next();
+
+ // Check if there are filter flags left
+ bool has_next();
+
+ // Returns the total number of filter flags in the file
+ uint32_t size();
+};
+
+
+// clocs file parser
+
+
+
+
+#endif /* ILLUMINA_PARSERS_H */
diff --git a/lib/kindex.cpp b/lib/kindex.cpp
new file mode 100644
index 0000000..b03d73f
--- /dev/null
+++ b/lib/kindex.cpp
@@ -0,0 +1,595 @@
+#include "kindex.h"
+
+
+
+int KixBuild::add_fasta(const std::string &fname) {
+ GenomeIdListType added_ids;
+ return add_fasta(fname, added_ids);
+}
+
+int KixBuild::add_fasta(const std::string &fname, GenomeIdListType &ids) {
+ //std::cout << "Sequence " << num_seq << ". Reading file " << fname << std::endl;
+ std::ios::sync_with_stdio(false);
+ std::ifstream::sync_with_stdio(false);
+
+ std::ifstream infile (fname.c_str());
+ assert(infile.is_open());
+
+ std::string line, seq;
+ HashIntoType fw; // forward k-mer
+ PositionType pos = 1; // use 1-based positions (to allow for negative positions)
+ GenomeIdType seq_id = 0;
+ std::string seq_name;
+ GenomeIdListType added_ids;
+ unsigned long int last_invalid = K+1; // number of characters since the last invalid character (e.g. N)
+
+ while(getline(infile, line)) {
+ if (line.length() == 0) {continue;};
+
+ if (line[0] == '>') { // header line
+ // initialize a new sequence
+ pos = 1;
+ fw = 0;
+ num_seq += 1;
+ seq_id = num_seq - 1;
+ last_invalid = K+1;
+ if (line.find(" ") != std::string::npos) {
+ seq_name = line.substr(0,line.find(" "));
+ } else {
+ seq_name = line;
+ }
+ added_ids.push_back(seq_id);
+ seq_names.push_back(seq_name);
+ seq_lengths.push_back(0);
+ assert(seq_names.size() == num_seq);
+ }
+ else { // sequence line
+ const char * lstr = line.c_str();
+ if (pos==1) { // beginning of a sequence
+ last_invalid = hash_fw(lstr, fw); // hash the first k-mer
+ seq_lengths.back()+=1;
+
+ if (last_invalid > K) {
+ add_kmer(fw,seq_id,pos);
+ }
+
+ for (unsigned int i = K; i < line.length(); i++) {
+ pos = i - K + 1 + 1; // use 1-based positions (to allow for negative positions)
+ seq_lengths.back()+=1;
+
+ if ( seq_chars.find(lstr[i]) == std::string::npos ) {
+ last_invalid = 1;
+ continue;
+ } else {
+ last_invalid += 1;
+ }
+
+ update_kmer(fw, twobit_repr(lstr[i]));
+
+ // add k-mer to database
+ if (last_invalid > K) {
+ add_kmer(fw,seq_id,pos);
+ }
+ }
+ }
+ else { // continue a sequence
+ for (unsigned int i = 0; i < line.length(); i++) {
+ pos++;
+ seq_lengths.back()+=1;
+
+ if ( seq_chars.find(lstr[i]) == std::string::npos ) {
+ last_invalid = 1;
+ continue;
+ } else {
+ last_invalid += 1;
+ }
+
+ update_kmer(fw, twobit_repr(lstr[i]));
+
+ // add k-mer to database
+ if (last_invalid > K) {
+ add_kmer(fw,seq_id,pos);
+ }
+ }
+ }
+ }
+
+ }
+ infile.close();
+ ids = added_ids;
+ return added_ids.size();
+}
+
+
+
+GenomeIdType KixBuild::add_sequence(const std::string &s) {
+ /* Add all k-mers in a sequence string s to the database.
+ A new ID is created for this sequence.
+ Return: sequence ID
+ */
+
+ assert(seq_names.size() == num_seq);
+
+ // increase the sequence counter
+ num_seq += 1;
+
+ // add a default name for the sequence
+ seq_names.push_back(std::string("Sequence_")+std::to_string(num_seq));
+ seq_lengths.push_back(0);
+
+ PositionType pos = 1; // use 1-based positions (to allow for negative positions)
+ const char * sp = s.c_str();
+ unsigned int length = s.length();
+
+ // the current k-mers
+ HashIntoType fw; // forward k-mer
+ HashIntoType last_invalid = hash_fw(sp, fw); // hash the first k-mer
+ seq_lengths.back()+=1;
+ if (last_invalid > K) {
+ add_kmer(fw,num_seq-1,pos);
+ }
+
+ for (unsigned int i = K; i < length; i++) {
+ pos = i - K + 1 + 1; // use 1-based positions (to allow for negative positions)
+ seq_lengths.back()+=1;
+ if ( seq_chars.find(sp[i]) == std::string::npos ) {
+ last_invalid = 1;
+ continue;
+ } else {
+ last_invalid += 1;
+ }
+
+ update_kmer(fw, twobit_repr(sp[i]));
+
+ // add k-mer to database
+ if (last_invalid > K) {
+ add_kmer(fw,num_seq-1,pos);
+ }
+ }
+
+ return num_seq;
+}
+
+
+int KixBuild::add_kmer(HashIntoType kmer, GenomeIdType id, PositionType pos) {
+ assert(kmer < db.size());
+ assert(id < num_seq);
+ GenomePosType gp;
+ gp.gid = id;
+ gp.pos = pos;
+ db[kmer].push_back(gp);
+ return 1;
+}
+
+/* Trim the k-mer index: removes all k-mers with more than
+ max_count occurrences in the reference genomes. Trimmed
+ k-mers are marked by the GenomeIdType TRIMMED (from
+ definitions.h). */
+uint64_t KixBuild::trim(uint64_t max_count) {
+
+ uint64_t trimmed = 0;
+ GenomePosType gp_trimmed (TRIMMED,0);
+
+ for (auto it = db.begin(); it != db.end(); ++it) {
+ if ((*it).size() > max_count) {
+ trimmed += (*it).size();
+ (*it).clear();
+ (*it).push_back(gp_trimmed);
+ }
+ }
+
+ return trimmed;
+}
+
+
+
+std::string KixBuild::get_name(GenomeIdType id) {
+ return seq_names[id];
+}
+
+
+std::vector<char> KixBuild::serialize() {
+ // first of all, sort the database entries by position
+ for (auto it = db.begin(); it != db.end(); ++it) {
+ std::sort(it->begin(), it->end(), gp_compare);
+ }
+
+ // calculate total size
+ unsigned long int total_size = 0;
+
+ // K itself
+ total_size += 1;
+
+ // total number of sequences in database
+ total_size += sizeof(GenomeIdType);
+
+ // sequence names
+ for (uint32_t i = 0; i < seq_names.size(); i++) {
+ uint16_t nm_length = seq_names.size();
+ total_size += sizeof(uint16_t);
+ total_size += nm_length;
+ }
+
+ // reference sequence lengths
+ for (uint32_t i = 0; i < seq_lengths.size(); i++) {
+ total_size += sizeof(uint64_t);
+ }
+
+ // database entries
+ for (auto it = db.begin(); it != db.end(); ++it) {
+ total_size += sizeof(uint32_t); // number of positions
+
+ uint32_t num_positions = (*it).size();
+ total_size += num_positions*(sizeof(GenomeIdType) + sizeof(PositionType));
+ }
+
+
+ // create the vector to store the data
+ std::vector<char> data (total_size);
+ char* d = data.data();
+
+ // write K
+ uint8_t kk = K;
+ memcpy(d,&kk,1);
+ d++;
+
+ // total number of sequences in database
+ memcpy(d,&num_seq,sizeof(GenomeIdType));
+ d += sizeof(GenomeIdType);
+
+ // sequence names
+ for (uint32_t i = 0; i < seq_names.size(); i++) {
+ uint16_t nm_length = seq_names[i].size();
+
+ memcpy(d,&nm_length,sizeof(uint16_t));
+ d += sizeof(uint16_t);
+
+ memcpy(d,seq_names[i].c_str(),nm_length);
+ d += nm_length;
+ }
+ for (uint32_t i = 0; i < seq_lengths.size(); i++) {
+ uint64_t seq_len = seq_lengths[i];
+ memcpy(d,&seq_len,sizeof(uint64_t));
+ d += sizeof(uint64_t);
+ }
+
+ // database entries
+ for (auto it = db.begin(); it != db.end(); ++it) {
+ // number of positions
+ uint32_t num_positions = (*it).size();
+ memcpy(d,&num_positions,sizeof(uint32_t));
+ d += sizeof(uint32_t);
+
+ // genome ID and position
+ for( uint32_t i = 0; i < num_positions; i++) {
+ GenomeIdType gid = (*it)[i].gid;
+ PositionType pos = (*it)[i].pos;
+
+ memcpy(d,&gid,sizeof(GenomeIdType));
+ d += sizeof(GenomeIdType);
+
+ memcpy(d,&pos,sizeof(PositionType));
+ d += sizeof(PositionType);
+ }
+ }
+
+ return data;
+}
+
+uint64_t KixBuild::serialize_file(std::string f) {
+ std::string fname = f;
+
+ // serialize data
+ std::vector<char> sdata = serialize();
+
+ // open binary file
+ FILE* ofile;
+ ofile = fopen(fname.c_str(), "wb");
+
+ if (!ofile) {
+ std::cerr << "Error serializing object to file " << fname << ": Could not open file for writing." << std::endl;
+ return 0;
+ }
+
+ // write all data
+ uint64_t written = fwrite(sdata.data(), 1, sdata.size(), ofile);
+
+ // close file
+ fclose(ofile);
+
+ if (written != sdata.size()){
+ std::cerr << "Error serializing object to file " << fname << ": Total size: " << sdata.size() << " bytes. Written: " << written << " bytes." << std::endl;
+ }
+
+ return written;
+}
+
+
+uint64_t KixBuild::deserialize(char* d) {
+ // the total number of bytes read
+ uint64_t bytes = 0;
+
+ // read and check K
+ uint8_t kk;
+ memcpy(&kk,d+bytes,1);
+ bytes++;
+ assert(K == kk);
+
+ // read total number of sequences in database
+ memcpy(&num_seq,d+bytes,sizeof(GenomeIdType));
+ bytes += sizeof(GenomeIdType);
+
+
+ // sequence names
+ seq_names.clear();
+ seq_lengths.clear();
+ for (uint32_t i = 0; i < num_seq; i++) {
+ uint16_t nm_length;
+ memcpy(&nm_length,d+bytes,sizeof(uint16_t));
+ bytes += sizeof(uint16_t);
+
+ char tmp[nm_length+1];
+ memcpy(tmp,d+bytes,nm_length);
+ tmp[nm_length] = 0; // make the string null-terminated
+ seq_names.push_back(tmp);
+ bytes += nm_length;
+ }
+ for (uint32_t i = 0; i < num_seq; i++) {
+ uint64_t seq_len;
+ memcpy(&seq_len,d+bytes,sizeof(uint64_t));
+ bytes += sizeof(uint64_t);
+ seq_lengths.push_back(seq_len);
+ }
+
+
+ // database entries
+ for (auto it = db.begin(); it != db.end(); ++it) {
+ // number of positions
+ uint32_t num_positions;
+ memcpy(&num_positions,d+bytes,sizeof(uint32_t));
+ bytes += sizeof(uint32_t);
+ (*it).clear();
+ (*it).reserve(num_positions);
+
+ // genome ID and position
+ for( uint32_t i = 0; i < num_positions; i++) {
+ GenomeIdType gid;
+ PositionType pos;
+ GenomePosType gp;
+
+ memcpy(&gid,d+bytes,sizeof(GenomeIdType));
+ bytes += sizeof(GenomeIdType);
+
+ memcpy(&pos,d+bytes,sizeof(PositionType));
+ bytes += sizeof(PositionType);
+
+ gp.gid = gid;
+ gp.pos = pos;
+
+ (*it).push_back(gp);
+ }
+ }
+
+ return bytes;
+}
+
+
+uint64_t KixBuild::deserialize_file(std::string f) {
+ std::string fname = f;
+
+ // obtain file size
+ unsigned long int size = get_filesize(fname);
+
+ // open binary file
+ FILE* ifile;
+ ifile = fopen(fname.c_str(), "rb");
+
+ if (!ifile) {
+ std::cerr << "Error reading from file " << fname << ": Could not open file." << std::endl;
+ return 0;
+ }
+
+ // allocate memory
+ std::vector<char> sdata (size);
+
+ // read all data
+ unsigned long int read = fread(sdata.data(), 1, size, ifile);
+
+ // close file
+ fclose (ifile);
+
+ if (read != size){
+ std::cerr << "Error reading from file " << fname << ": File size: " << size << " bytes. Read: " << read << " bytes." << std::endl;
+ return 0;
+ }
+
+ // deserialize data
+ deserialize(sdata.data());
+
+ return read;
+}
+
+
+
+
+
+
+bool gp_compare (GenomePosType i,GenomePosType j) {
+ return (i.pos < j.pos);
+}
+
+
+
+
+
+uint64_t KixRun::deserialize(char* d) {
+ // the total number of bytes read
+ uint64_t bytes = 0;
+
+ // read and check K
+ uint8_t kk;
+ memcpy(&kk,d+bytes,1);
+ bytes++;
+ assert(kk == K);
+
+ // read total number of sequences in database
+ memcpy(&num_seq,d+bytes,sizeof(GenomeIdType));
+ bytes += sizeof(GenomeIdType);
+
+ // sequence names
+ seq_names.clear();
+ seq_lengths.clear();
+ for (uint32_t i = 0; i < num_seq; i++) {
+ uint16_t nm_length;
+ memcpy(&nm_length,d+bytes,sizeof(uint16_t));
+ bytes += sizeof(uint16_t);
+
+ char tmp[nm_length+1];
+ memcpy(tmp,d+bytes,nm_length);
+ tmp[nm_length] = 0; // make the string null-terminated
+ seq_names.push_back(tmp);
+ bytes += nm_length;
+ }
+
+ // sequence lengths
+ for (uint32_t i = 0; i < num_seq; i++) {
+ uint64_t seq_len;
+ memcpy(&seq_len,d+bytes,sizeof(uint64_t));
+ bytes += sizeof(uint64_t);
+ seq_lengths.push_back(seq_len);
+ }
+
+
+ // database entries
+ for (auto it = db.begin(); it != db.end(); ++it) {
+ // number of positions
+ uint32_t* num_positions = (uint32_t*)(d+bytes);
+
+ // total size of data block for this k-mer
+ uint32_t total_size = sizeof(uint32_t) + (*num_positions)*GenomePos_size;
+
+ // allocate the memory
+ (*it) = d+bytes;
+
+ // increase pointer
+ bytes += total_size;
+ }
+
+ return bytes;
+}
+
+
+
+uint64_t KixRun::deserialize_file(std::string f) {
+ std::string fname = f;
+
+ sdata = read_binary_file(f);
+ /*
+ // obtain file size
+ unsigned long int size = get_filesize(fname);
+
+ // open binary file
+ FILE* ifile;
+ ifile = fopen(fname.c_str(), "rb");
+
+ if (!ifile) {
+ std::cerr << "Error reading from file " << fname << ": Could not open file." << std::endl;
+ return 0;
+ }
+
+ // allocate memory
+ sdata.resize(size,0);
+
+ // read all data
+ unsigned long int read = fread(sdata.data(), 1, size, ifile);
+
+ // close file
+ fclose (ifile);
+
+ if (read != size){
+ std::cerr << "Error reading from file " << fname << ": File size: " << size << " bytes. Read: " << read << " bytes." << std::endl;
+ return 0;
+ }
+ */
+ // deserialize data
+ deserialize(sdata.data());
+
+ return sdata.size();
+}
+
+
+
+std::string KixRun::get_name(GenomeIdType id) {
+ return seq_names[id];
+}
+
+
+/* Retrieve all occurrences (fwd & rc) of kmer in the reference from the index */
+GenomePosListType KixRun::retrieve_positions(HashIntoType kmer) {
+
+ // get the reverse complement of the kmer
+ HashIntoType kmer_rc = rc(kmer);
+
+ // obtain the list of positions for each k-mer
+ char* fwd_begin = db[kmer];
+ uint32_t fwd_len;
+ memcpy(&fwd_len,fwd_begin,sizeof(uint32_t));
+ char* rev_begin = db[kmer_rc];
+ uint32_t rev_len;
+ memcpy(&rev_len,rev_begin,sizeof(uint32_t));
+
+ // the position list: all positions in all genomes, where the current k-mer was found
+ GenomePosListType pos;
+ pos.reserve(fwd_len+rev_len);
+
+ // indicate reverse complement hits by negative position and append in reverse order
+ for(uint64_t i = 0; i < rev_len; i++) {
+ GenomePosType rev;
+ memcpy(&rev.gid,rev_begin+sizeof(uint32_t)+(rev_len-1-i)*GenomePos_size, sizeof(GenomeIdType));
+ memcpy(&rev.pos,rev_begin+sizeof(uint32_t)+(rev_len-1-i)*GenomePos_size+sizeof(GenomeIdType), sizeof(PositionType));
+ rev.pos = - rev.pos;
+ if (rev.gid == TRIMMED) {
+ pos.clear();
+ pos.push_back(GenomePosType(TRIMMED,0));
+ return pos;
+ }
+ pos.push_back(rev);
+ }
+
+ // then add the forward hits with positive position in normal order --> position list is sorted if index was sorted!
+ for(uint64_t i = 0; i < fwd_len; i++) {
+ GenomePosType fwd;
+ memcpy(&fwd.gid,fwd_begin+sizeof(uint32_t)+i*GenomePos_size, sizeof(GenomeIdType));
+ memcpy(&fwd.pos,fwd_begin+sizeof(uint32_t)+i*GenomePos_size+sizeof(GenomeIdType), sizeof(PositionType));
+ if (fwd.gid == TRIMMED) {
+ pos.clear();
+ pos.push_back(GenomePosType(TRIMMED,0));
+ return pos;
+ }
+ pos.push_back(fwd);
+ }
+
+ return pos;
+}
+
+
+// generate a SAM compliant header string
+std::string KixRun::get_SAM_header() {
+ std::stringstream s;
+
+ // header
+ s << "@HD\tVN:1.5\tSO:unsorted" << std::endl;
+
+ // sequence information
+ assert(seq_names.size() == seq_lengths.size());
+ for ( uint64_t i = 0; i < seq_names.size(); i++ ) {
+ s << "@SQ\tSN:" << seq_names[i] << "\tLN:" << seq_lengths[i] << std::endl;
+ }
+
+ // program information
+ s << "@PG\tHiLive" << std::endl;
+
+
+ return s.str();
+}
+
diff --git a/lib/kindex.h b/lib/kindex.h
new file mode 100644
index 0000000..ab56d04
--- /dev/null
+++ b/lib/kindex.h
@@ -0,0 +1,103 @@
+#ifndef KINDEX_H
+#define KINDEX_H
+
+#include "headers.h"
+#include "definitions.h"
+#include "tools.h"
+
+
+
+//-------------------------------------------------------------------//
+//------ The k-mer index builder: KixBuild -------------------------//
+//-------------------------------------------------------------------//
+
+class KixBuild {
+ // add a single k-mer to the database
+ // Note: the index uses 1-based positions (to allow for negative positions)
+ int add_kmer(HashIntoType kmer, GenomeIdType id, PositionType pos);
+
+ public:
+
+ // add k-mers of all sequences in FASTA file
+ int add_fasta(const std::string &fname, GenomeIdListType &ids);
+ int add_fasta(const std::string &fname);
+
+ // add all k-mers in a string sequence to the database
+ GenomeIdType add_sequence(const std::string &s);
+
+ // trim the database: remove kmers with more than max_count occurrences
+ uint64_t trim(uint64_t max_count);
+
+ // set the name of a sequence
+ int set_name(GenomeIdType id, const std::string &name);
+
+ // get the name of a sequence
+ std::string get_name(GenomeIdType id);
+
+ // serialize the KixBuild
+ std::vector<char> serialize();
+
+ // serialize and store the KixBuild to a file
+ uint64_t serialize_file(std::string f);
+
+ // deserialize KixBuild
+ uint64_t deserialize(char* d);
+
+ // load and deserialize KixBuild from file
+ uint64_t deserialize_file(std::string f);
+
+
+ GenomeIdType num_seq; // total number of sequences in the database
+ KmerIndexType db; // the database structure itself
+ StringListType seq_names; // names of the sequences in the database
+ std::vector<uint32_t> seq_lengths; // lengths of the sequences in the database
+
+}; // END class KIindex
+
+
+
+//-------------------------------------------------------------------//
+//------ The k-mer runtime index: KixRun ---------------------------//
+//-------------------------------------------------------------------//
+
+
+typedef std::array<char*,n_kmer> KixRunDB;
+
+class KixRun {
+ public:
+ // pointer to the matching positions for a k-mer
+ char* kmer(HashIntoType kmer);
+
+ // get the name of a sequence
+ std::string get_name(GenomeIdType id);
+
+ // retrieve all fwd and rc occurrences of kmer in the index
+ GenomePosListType retrieve_positions(HashIntoType kmer);
+
+ // deserialize Kix
+ uint64_t deserialize(char* d);
+
+ // load and deserialize Kix from file
+ uint64_t deserialize_file(std::string f);
+
+ // generate a SAM compliant header string
+ std::string get_SAM_header();
+
+ // Database content
+ GenomeIdType num_seq; // total number of sequences in the database
+ StringListType seq_names; // names of the sequences in the database
+ std::vector<uint32_t> seq_lengths; // lengths of the sequences in the database
+ KixRunDB db; // the lightweight database structure itself, pointing to sdata
+ std::vector<char> sdata; // actual chunk of data
+}; // END class KixRun
+
+
+
+
+
+
+
+
+
+
+#endif /* KINDEX_H */
diff --git a/lib/parallel.cpp b/lib/parallel.cpp
new file mode 100644
index 0000000..6d54171
--- /dev/null
+++ b/lib/parallel.cpp
@@ -0,0 +1,287 @@
+#include "parallel.h"
+
+
+
+std::ostream& operator<<(std::ostream& os, const Task& t)
+{
+ os << "Lane " << t.lane << " Tile " << t.tile << " Cycle " << t.cycle;
+ return os;
+}
+
+
+// Add element to the task list
+void TaskQueue::push(Task t) {
+ std::lock_guard<std::mutex> lk(m);
+ tasks.push(t);
+}
+
+// Get element from the task list. If TaskList is empty, return NO_TASK.
+Task TaskQueue::pop() {
+ std::lock_guard<std::mutex> lk(m);
+ if (!tasks.empty()) {
+ Task t = tasks.front();
+ tasks.pop();
+ return t;
+ }
+ else {
+ return NO_TASK;
+ }
+}
+
+// return the size of the queue
+uint64_t TaskQueue::size() {
+ std::lock_guard<std::mutex> lk(m);
+ return tasks.size();
+}
+
+
+
+// create a vector with all lane numbers
+std::vector<uint16_t> all_lanes() {
+ std::vector<uint16_t> ln;
+ for (uint16_t l=0; l < 8; l++)
+ ln.push_back(l+1);
+ return ln;
+}
+
+// create a vector with one lane number
+std::vector<uint16_t> one_lane(uint16_t l) {
+ return std::vector<uint16_t> (1,l);
+}
+
+
+// create a vector with all tile numbers
+std::vector<uint16_t> all_tiles() {
+ std::vector<uint16_t> tl;
+ for (uint16_t l = 0; l < 2; l++) {
+ for (uint16_t s = 0; s < 3; s++) {
+ for (uint16_t t = 0; t < 16; t++) {
+ // construct tile number
+ tl.push_back( (l+1)*1000 + (s+1)*100 + (t+1) );
+ }
+ }
+ }
+ return tl;
+}
+
+// create a vector with one tile number
+std::vector<uint16_t> one_tile(uint16_t t) {
+ return std::vector<uint16_t> (1,t);
+}
+
+
+// initialize agenda with root directory and read length only (all lanes, all tiles)
+Agenda::Agenda (std::string rt, uint16_t rl) {
+
+ // add lanes 1-8 to the list
+ std::vector<uint16_t> ln = all_lanes();
+
+ // call the tiles constructor
+ Agenda(rt, rl, ln);
+
+}
+
+// initialize agenda with root directory, read length, and lanes (all tiles)
+Agenda::Agenda (std::string rt, uint16_t rl, std::vector<uint16_t> ln) {
+
+ // add all tiles to the list
+ std::vector<uint16_t> tl = all_tiles();
+
+ // call the full constructor
+ Agenda (rt, rl, ln, tl);
+
+}
+
+// initialize agenda with root directory, read length, lanes, and tiles
+Agenda::Agenda (std::string rt, uint16_t rl, std::vector<uint16_t> ln, std::vector<uint16_t> tl) {
+
+ root = rt;
+ rlen = rl;
+ lanes = ln;
+ tiles = tl;
+
+ // set up the agenda
+ items.clear();
+ for (uint16_t ln_id = 0; ln_id < lanes.size(); ln_id++) {
+ std::vector<std::vector<ItemStatus> > lane_status;
+ for (uint16_t tl_id = 0; tl_id < tiles.size(); tl_id++) {
+ std::vector<ItemStatus> tile_status (rlen, WAITING);
+ lane_status.push_back(tile_status);
+ }
+ items.push_back(lane_status);
+ }
+
+}
+
+// check for BCL files and update item status
+void Agenda::update_status () {
+
+ // iterate over lanes
+ for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) {
+
+ // iterate over all tiles
+ for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) {
+
+ // get the first cycle that is not in the FINISHED status
+ uint16_t first_unfinished = 0;
+ while ( (first_unfinished < items[ln_id][tl_id].size()) && (items[ln_id][tl_id][first_unfinished] == FINISHED)) {
+ first_unfinished++;
+ }
+
+ // if there is one, check if there is a BCL file available
+ if ((first_unfinished != items[ln_id][tl_id].size()) && (items[ln_id][tl_id][first_unfinished] == WAITING)) {
+ std::string this_fname = bcl_name(root, lanes[ln_id], tiles[tl_id], first_unfinished+1);
+ // only change the status if the file exists
+ if ( file_exists(this_fname) ) {
+ // TODO: probably find a way to check if the machine currently writes to that file
+ items[ln_id][tl_id][first_unfinished] = BCL_AVAILABLE;
+ }
+ }
+
+ }
+
+ }
+
+}
+
+
+// generate a new task from the agenda
+Task Agenda::get_task(){
+ // iterate over lanes
+ for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) {
+
+ // iterate over all tiles
+ for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) {
+
+ // check if there is a cycle with an unprocessed BCL file
+ uint16_t unprocessed = 0;
+ while ( (unprocessed < items[ln_id][tl_id].size()) && (items[ln_id][tl_id][unprocessed] != BCL_AVAILABLE)) {
+ unprocessed++;
+ }
+
+ // generate a new task if there is an unprocessed BCL file
+ if ( unprocessed != items[ln_id][tl_id].size() ) {
+ Task t (lanes[ln_id], tiles[tl_id], unprocessed+1, rlen, root);
+ return t;
+ }
+
+ }
+
+ }
+ // return indicator that no new task could be created
+ return NO_TASK;
+}
+
+
+// set a status
+void Agenda::set_status(Task t, ItemStatus status) {
+ // get the lane index
+ uint64_t diff = std::find(lanes.begin(), lanes.end(), t.lane) - lanes.begin();
+ if ( diff >= lanes.size() ) {
+ throw std::out_of_range("Lane ID out of range.");
+ }
+ uint16_t ln_id = diff;
+
+ // get the tile index
+ diff = std::find(tiles.begin(), tiles.end(), t.tile) - tiles.begin();
+ if ( diff >= tiles.size() ) {
+ throw std::out_of_range("Tile ID out of range.");
+ }
+ uint16_t tl_id = diff;
+
+ // get the cycle index
+ if ( (t.cycle > rlen) || (t.cycle == 0) ) {
+ throw std::out_of_range("Cycle out of range.");
+ }
+ uint16_t cl_id = t.cycle -1;
+
+ items[ln_id][tl_id][cl_id] = status;
+}
+
+
+// get the status of a task
+ItemStatus Agenda::get_status(Task t) {
+ // get the lane index
+ uint64_t diff = std::find(lanes.begin(), lanes.end(), t.lane) - lanes.begin();
+ if ( diff >= lanes.size() ) {
+ throw std::out_of_range("Lane ID out of range.");
+ }
+ uint16_t ln_id = diff;
+
+ // get the tile index
+ diff = std::find(tiles.begin(), tiles.end(), t.tile) - tiles.begin();
+ if ( diff >= tiles.size() ) {
+ throw std::out_of_range("Tile ID out of range.");
+ }
+ uint16_t tl_id = diff;
+
+ // get the cycle index
+ if ( (t.cycle > rlen) || (t.cycle == 0) ) {
+ throw std::out_of_range("Cycle out of range.");
+ }
+ uint16_t cl_id = t.cycle -1;
+
+ return items[ln_id][tl_id][cl_id];
+}
+
+
+// check if all items of the agenda were processed, if possible
+bool Agenda::finished() {
+ // check for each tile if either all cycles are finished OR there is a failed status item
+ for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) {
+ for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) {
+ for (uint16_t cl_id = 0; cl_id < items[ln_id][tl_id].size(); ++cl_id) {
+ ItemStatus s = items[ln_id][tl_id][cl_id];
+ if ( s == FAILED ) {
+ // the rest of the tile is "allowed" to be unprocessed --> skip
+ continue;
+ }
+ else if (s != FINISHED) {
+ // otherwise any other status means that the agenda is not finished
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+
+// the total number of tasks on the agenda
+uint32_t Agenda::task_count() {
+ return lanes.size() * tiles.size() * rlen;
+}
+
+
+// the total number of finished tasks on the agenda
+uint32_t Agenda::tasks_finished() {
+ uint32_t num_finished = 0;
+ // iterate over all items and count the finished tasks
+ for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) {
+ for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) {
+ for (uint16_t cl_id = 0; cl_id < items[ln_id][tl_id].size(); ++cl_id) {
+ if (items[ln_id][tl_id][cl_id] == FINISHED) {
+ num_finished++;
+ }
+ }
+ }
+ }
+ return num_finished;
+}
+
+
+// generate a complete TaskQueue with tasks to generate SAM files
+// SAM files can only be generated for tiles where all cycles are completed
+std::vector<Task> Agenda::get_SAM_tasks() {
+ std::vector<Task> tv;
+ // find tiles that are completely mapped
+ for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) {
+ for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) {
+ if ( items[ln_id][tl_id][rlen-1] == FINISHED ) {
+ tv.push_back(Task(lanes[ln_id],tiles[tl_id],rlen,rlen,root));
+ }
+ }
+ }
+
+ return tv;
+}
diff --git a/lib/parallel.h b/lib/parallel.h
new file mode 100644
index 0000000..c404592
--- /dev/null
+++ b/lib/parallel.h
@@ -0,0 +1,130 @@
+#ifndef PARALLEL_H
+#define PARALLEL_H
+
+#include "headers.h"
+#include "definitions.h"
+#include "tools.h"
+#include "kindex.h"
+
+//------ Threading tools --------------------------------------------//
+
+// Task data structure. Contains all information for a thread to
+// process a BCL file.
+struct Task {
+ // dataset information
+ uint16_t lane;
+ uint16_t tile;
+ uint16_t cycle;
+ uint16_t rlen;
+ std::string root;
+
+ // constructor initializes all member variables
+ Task(uint16_t ln, uint16_t tl, uint16_t cl, uint16_t rl, std::string rt): lane(ln), tile(tl), cycle(cl), rlen(rl), root(rt) {};
+
+ // overload << operator for printing
+ friend std::ostream& operator<<(std::ostream& os, const Task& t);
+};
+
+inline bool operator==(const Task& l, const Task& r){ return (r.lane==l.lane)&&(r.tile==l.tile)&&(r.cycle==l.cycle)&&(r.rlen==l.rlen)&&(r.root==l.root); }
+inline bool operator!=(const Task& l, const Task& r){ return !(l==r); }
+
+const Task NO_TASK (255,0,0,0,"");
+
+// Task queue data structure. Manages a list of task objects in a thread safe way.
+class TaskQueue {
+ // the internal queue
+ std::queue<Task> tasks;
+
+ // mutex to ensure that only one process can access the queue at once
+ std::mutex m;
+
+ public:
+ // Add element to the task list
+ void push(Task t);
+
+ // Get element from the task list
+ Task pop();
+
+ // return the size of the queue
+ uint64_t size();
+};
+
+
+// Agenda item status
+typedef uint8_t ItemStatus;
+const ItemStatus WAITING = 0;
+const ItemStatus BCL_AVAILABLE = 1;
+const ItemStatus RUNNING = 2;
+const ItemStatus FINISHED = 3;
+const ItemStatus RETRY = 4;
+const ItemStatus FAILED = 5;
+const ItemStatus ERROR = std::numeric_limits<ItemStatus>::max();
+
+
+// Agenda: monitors the sequencing process and manages the alignment
+// - Monitors BCL files
+// - generates new tasks
+// - receive finished/fail signals
+class Agenda {
+ // list of items on the agenda. items[lane][tile][cycle]
+ std::vector< std::vector< std::vector<ItemStatus> > > items;
+
+ // dataset information
+ std::string root;
+ uint16_t rlen;
+ std::vector<uint16_t> lanes;
+ std::vector<uint16_t> tiles;
+
+ public:
+ // initialize agenda with root directory and read length only (all lanes, all tiles)
+ Agenda (std::string rt, uint16_t rl);
+
+ // initialize agenda with root directory, read length, and lanes (all tiles)
+ Agenda (std::string rt, uint16_t rl, std::vector<uint16_t> ln);
+
+ // initialize agenda with root directory, read length, lanes, and tiles
+ Agenda (std::string rt, uint16_t rl, std::vector<uint16_t> ln, std::vector<uint16_t> tl);
+
+ // check for BCL files and update item status
+ void update_status();
+
+ // generate a new task from the agenda
+ Task get_task();
+
+ // set the status of a task
+ void set_status(Task t, ItemStatus status);
+
+ // get the status of a task
+ ItemStatus get_status(Task t);
+
+ // check if all items of the agenda were processed, if possible
+ bool finished();
+
+ // the total number of tasks on the agenda
+ uint32_t task_count();
+
+ // the total number of finished tasks on the agenda
+ uint32_t tasks_finished();
+
+ // generate a complete TaskQueue with tasks to generate SAM files
+ // SAM files can only be generated for tiles where all cycles are completed
+ std::vector<Task> get_SAM_tasks();
+};
+
+
+
+// create a vector with all lane numbers
+std::vector<uint16_t> all_lanes();
+
+// create a vector with one lane number
+std::vector<uint16_t> one_lane(uint16_t l);
+
+// create a vector with all tile numbers
+std::vector<uint16_t> all_tiles();
+
+// create a vector with one tile number
+std::vector<uint16_t> one_tile(uint16_t t);
+
+
+
+#endif /* PARALLEL_H */
diff --git a/lib/tools.cpp b/lib/tools.cpp
new file mode 100644
index 0000000..63efe7d
--- /dev/null
+++ b/lib/tools.cpp
@@ -0,0 +1,253 @@
+#include "tools.h"
+
+// reads a binary file from hdd and stores its raw content in a char vector
+std::vector<char> read_binary_file(const std::string &fname) {
+
+ // get file size
+ uint64_t size = get_filesize(fname);
+
+ // open binary file
+ FILE* f;
+ f = fopen(fname.c_str(), "rb");
+
+ if (!f) {
+ std::cerr << "Error reading binary file " << fname << ": Could not open file." << std::endl;
+ return std::vector<char>();
+ }
+
+ // allocate memory
+ std::vector<char> data (size);
+
+ // read all data at once
+ uint64_t read = fread(data.data(), 1, size, f);
+
+ if (read != size){
+ std::cerr << "Error reading binary file " << fname << ": File size: " << size << " bytes. Read: " << read << " bytes." << std::endl;
+ return std::vector<char>();
+ }
+
+ fclose(f);
+
+ return data;
+}
+
+
+// checks if a directory with the given path exists
+bool is_directory(const std::string &path) {
+ if ( boost::filesystem::exists(path) ) {
+ if ( boost::filesystem::is_directory(path) ) {
+ return true;
+ }
+ else {
+ return false;
+ }
+ }
+ else {
+ return false;
+ }
+}
+
+// checks if a file exists
+bool file_exists(const std::string &fname) {
+ return boost::filesystem::exists(fname);
+ /*if (FILE *f = fopen(fname.c_str(), "r")) {
+ fclose(f);
+ return true;
+ }
+ else {
+ return false;
+ }*/
+}
+
+
+// writes a char vector into a binary file
+uint64_t write_binary_file(const std::string &fname, const std::vector<char> & data) {
+
+ // open binary file
+ FILE* ofile;
+ ofile = fopen(fname.c_str(), "wb");
+
+ if (!ofile) {
+ std::cerr << "Error serializing object to file " << fname << ": Could not open file for writing." << std::endl;
+ return 0;
+ }
+
+ // write all data
+ uint64_t written = fwrite(data.data(), 1, data.size(), ofile);
+
+ // close file
+ fclose(ofile);
+
+ if (written != data.size()){
+ std::cerr << "Error serializing object to file " << fname << ": Total size: " << data.size() << " bytes. Written: " << written << " bytes." << std::endl;
+ }
+
+ return written;
+}
+
+
+
+// extract the number of reads from a BCL file
+uint32_t num_reads_from_bcl(std::string bcl) {
+ // open BCL file of first cycle
+ FILE* ifile;
+ ifile = fopen(bcl.c_str(), "rb");
+
+ if (!ifile) {
+ std::cerr << "Error reading BCL file " << bcl << ": Could not open file." << std::endl;
+ return 0;
+ }
+
+ // extract the number of reads
+ uint32_t num_reads;
+ assert(fread(&num_reads, 1, sizeof(uint32_t), ifile));
+
+ // close file
+ fclose (ifile);
+
+ return num_reads;
+}
+
+
+
+// Update an existing kmer by left-shifting all nucleotides and appending new nucleotide
+void update_kmer(HashIntoType &kmer, HashIntoType nuc) {
+
+ // left-shift the previous k-mer
+ kmer = kmer << 2;
+
+ // 'or' in the current nucleotide
+ // only use the last 2 bits
+ kmer |= (nuc & 3);
+
+ // mask off the 2 bits we shifted over.
+ kmer &= MASK;
+
+}
+
+
+/* get the size (in bytes) of a file */
+std::ifstream::pos_type get_filesize(const std::string &fname)
+{
+ std::ifstream in(fname, std::ios::binary | std::ios::ate);
+ return in.tellg();
+}
+
+
+/* calculates the first forward and reverse complement k-mer in the
+ string <kmer> and returns the canonical representation. */
+HashIntoType hash(const char * kmer, HashIntoType& _h, HashIntoType& _r)
+{
+ assert(strlen(kmer) >= K);
+
+ HashIntoType h = 0, r = 0;
+
+ h |= twobit_repr(kmer[0]);
+ r |= twobit_comp(kmer[K-1]);
+
+ for (unsigned int i = 1, j = K - 2; i < K; i++, j--) {
+ h = h << 2;
+ r = r << 2;
+
+ h |= twobit_repr(kmer[i]);
+ r |= twobit_comp(kmer[j]);
+ }
+
+ _h = h;
+ _r = r;
+
+ return (h)<(r)?h:r;
+}
+
+/* calculates the first forward k-mer in the string <kmer> */
+HashIntoType hash_fw(const char * kmer, HashIntoType& _h)
+{
+ assert(strlen(kmer) >= K);
+
+ HashIntoType h = 0;
+ HashIntoType last_invalid = K+1;
+
+ h |= twobit_repr(kmer[0]);
+
+ for (unsigned int i = 1; i < K; i++) {
+ h = h << 2;
+ h |= twobit_repr(kmer[i]);
+ if ( seq_chars.find(kmer[i]) == std::string::npos ) {
+ last_invalid = K+1-i;
+ }
+ }
+
+ _h = h;
+
+ return last_invalid;
+}
+
+
+/* returns the reverse complement of a k-mer */
+HashIntoType rc(HashIntoType fw)
+{
+
+ HashIntoType rc = 0;
+
+ // Illumina uses
+ // A = 0b00
+ // C = 0b01
+ // G = 0b10
+ // T = 0b11
+ // so, inverting bits produces the rc: rc(A) = ~A
+
+ for (unsigned int i = 0; i < 2*K; i+=2) {
+ rc |= (~(fw >> i) & 3) << (2*K - i - 2);
+ }
+
+ return rc;
+}
+
+
+
+// file name construction functions
+
+// construct BCL file name from: root, lane, tile, cycle
+std::string bcl_name(std::string rt, uint16_t ln, uint16_t tl, uint16_t cl) {
+ std::ostringstream path_stream;
+ path_stream << rt << "/L00" << ln << "/C" << cl << ".1/s_"<< ln <<"_" << tl << ".bcl";
+ return path_stream.str();
+}
+
+
+// construct alignment file name from: root, lane, tile, cycle
+std::string alignment_name(std::string rt, uint16_t ln, uint16_t tl, uint16_t cl) {
+ std::ostringstream path_stream;
+ path_stream << rt << "/L00" << ln << "/s_"<< ln << "_" << tl << "." << cl << ".align";
+ return path_stream.str();
+}
+
+// construct tile-wise SAM file name from: root, lane, tile
+std::string sam_tile_name(std::string rt, uint16_t ln, uint16_t tl) {
+ std::ostringstream path_stream;
+ path_stream << rt << "/L00" << ln << "/s_"<< ln << "_" << tl << ".sam";
+ return path_stream.str();
+}
+
+// construct lane-wise SAM file name from: root, lane
+std::string sam_lane_name(std::string rt, uint16_t ln) {
+ std::ostringstream path_stream;
+ path_stream << rt << "/L00" << ln << "/s_"<< ln << ".sam";
+ return path_stream.str();
+}
+
+// construct filter file name from: root, lane, tile
+std::string filter_name(std::string rt, uint16_t ln, uint16_t tl) {
+ std::ostringstream path_stream;
+ path_stream << rt << "/L00" << ln << "/s_"<< ln << "_" << tl << ".filter";
+ return path_stream.str();
+}
+
+// construct position file name from: root, lane, tile
+std::string position_name(std::string rt, uint16_t ln, uint16_t tl) {
+ std::ostringstream path_stream;
+ path_stream << rt << "../L00" << ln << "/s_"<< ln << "_" << tl << ".clocs";
+ return path_stream.str();
+}
+
+
diff --git a/lib/tools.h b/lib/tools.h
new file mode 100644
index 0000000..667d0b6
--- /dev/null
+++ b/lib/tools.h
@@ -0,0 +1,46 @@
+#ifndef TOOLS_H
+#define TOOLS_H
+
+#include "headers.h"
+#include "definitions.h"
+#include "kindex.h"
+
+// calculates the total size of a file in bytes
+std::ifstream::pos_type get_filesize(const std::string &fname);
+
+// checks if a directory with the given name exists
+bool is_directory(const std::string &path);
+
+// checks if a file exists
+bool file_exists(const std::string &fname);
+
+// reads a binary file from hdd and stores its raw content in a char vector
+std::vector<char> read_binary_file(const std::string &fname);
+
+// extract the number of reads from a BCL file
+uint32_t num_reads_from_bcl(std::string bcl);
+
+// writes a char vector into a binary file
+uint64_t write_binary_file(const std::string &fname, const std::vector<char> & data);
+
+//------ Hashing helper functions ---------------------------------//
+HashIntoType hash(const char * kmer, HashIntoType& _h, HashIntoType& _r);
+HashIntoType hash_fw(const char * kmer, HashIntoType& _h);
+HashIntoType rc(HashIntoType fw);
+
+// Update an existing kmer by left-shifting all nucleotides and appending new nucleotide
+void update_kmer(HashIntoType &kmer, HashIntoType nuc);
+
+// file name construction functions
+std::string bcl_name(std::string rt, uint16_t ln, uint16_t tl, uint16_t cl);
+std::string alignment_name(std::string rt, uint16_t ln, uint16_t tl, uint16_t cl);
+std::string filter_name(std::string rt, uint16_t ln, uint16_t tl);
+std::string position_name(std::string rt, uint16_t ln, uint16_t tl);
+std::string sam_tile_name(std::string rt, uint16_t ln, uint16_t tl);
+std::string sam_lane_name(std::string rt, uint16_t ln);
+
+
+
+
+
+#endif /* TOOLS_H */
diff --git a/tools/.ann.location b/tools/.ann.location
new file mode 100644
index 0000000..573541a
--- /dev/null
+++ b/tools/.ann.location
@@ -0,0 +1 @@
+0
diff --git a/tools/build_index.cpp b/tools/build_index.cpp
new file mode 100644
index 0000000..fc6307d
--- /dev/null
+++ b/tools/build_index.cpp
@@ -0,0 +1,125 @@
+#include <boost/program_options.hpp>
+
+#include "../lib/headers.h"
+#include "../lib/definitions.h"
+#include "../lib/kindex.h"
+
+namespace po = boost::program_options;
+
+std::string license =
+"Copyright (c) 2015, Martin S. Lindner, marzin at mail-lindner.de\n"
+"All rights reserved.\n"
+"\n"
+"Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n"
+"\n"
+"1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n"
+"\n"
+"2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n"
+"\n"
+"3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n"
+"\n"
+"THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PRO [...]
+
+
+
+
+int main(int argc, char* argv[]) {
+ // setting up the command line interface
+ po::options_description general("General");
+ general.add_options()
+ ("help,h", "Print this help message and exit")
+ ("license", "Print licensing information and exit");
+
+ po::options_description parameters("Parameters");
+ parameters.add_options()
+ ("INPUT", po::value<std::string>()->required(), "Input reference genome (fasta file)");
+
+ po::options_description options("Options");
+ options.add_options()
+ ("outfile,o", po::value<std::string>(), "Set output file name [Default: INPUT.kix]")
+ ("trim,t", po::value<uint32_t>(), "Ignore k-mers with more than t occurrences. [Default: no limit]");
+
+ po::options_description cmdline_options;
+ cmdline_options.add(general).add(parameters).add(options);
+
+ po::options_description visible_options;
+ visible_options.add(general).add(options);
+
+ std::stringstream help_message;
+ help_message << "HiLive index builder v"<< HiLive_VERSION_MAJOR << "." << HiLive_VERSION_MINOR << std::endl;
+ help_message << "Index creation tool for HiLive - Realtime Alignment of Illumina Reads" << std::endl;
+ help_message << "Copyright (c) 2015, Martin S. Lindner" << std::endl;
+ help_message << "HiLive is open-source software. Check with --license for details." << std::endl << std::endl;
+ help_message << "Fixed k-mer size: " << K << std::endl << std::endl;
+ help_message << "Usage: " << std::string(argv[0]) << " [options] INPUT" << std::endl;
+ help_message << " INPUT Reference genomes in (multi-) FASTA format" << std::endl;
+
+ help_message << visible_options << std::endl;
+
+ po::positional_options_description p;
+ p.add("INPUT", 1);
+
+ po::variables_map vm;
+ try {
+ // parse arguments
+ po::store(po::command_line_parser(argc, argv).
+ options(cmdline_options).positional(p).run(), vm);
+ // first check if -h or --help was called
+ if (vm.count("help")) {
+ std::cout << help_message.str();
+ return 1;
+ }
+ // first check if --license was called
+ if (vm.count("license")) {
+ std::cout << license << std::endl;
+ return 1;
+ }
+
+ // then check arguments
+ po::notify(vm);
+ }
+ catch ( boost::program_options::required_option& e ) {
+ std::cerr << "Missing Parameter: " << e.what() << std::endl << std::endl;
+ std::cout << help_message.str();
+ return -1;
+ }
+ catch( boost::program_options::error& e) {
+ std::cerr << "Error while parsing command line options: " << e.what() << std::endl << std::endl;
+ std::cout << help_message.str();
+ return -1;
+ }
+
+
+ std::string fasta_name = vm["INPUT"].as<std::string>();
+
+ std::string index_name;
+ if (vm.count("outfile")) {
+ // use the output file name if provided as argument
+ index_name = vm["outfile"].as<std::string>();
+ } else {
+ // construct it from the input file name otherwise
+ index_name = fasta_name + std::string(".kix");
+ }
+
+ // get k-mer trimming
+ int trim = 0;
+ if (vm.count("trim")) {
+ trim = vm["trim"].as<uint32_t>();
+ }
+
+
+ std::cout << "Creating index with K=" << K << " from file " << fasta_name << std::endl;
+ KixBuild* index = new KixBuild();
+ index->add_fasta(fasta_name);
+
+ if (trim > 0) {
+ uint64_t trimmed = index->trim(trim);
+ std::cout << "Removed " << trimmed << " k-mer positions from the database." << std::endl;
+ }
+
+
+ std::cout << "Writing index to file " << index_name << std::endl;
+ index->serialize_file(index_name);
+
+ delete index;
+}
diff --git a/tools/hilive.cpp b/tools/hilive.cpp
new file mode 100644
index 0000000..d5ecae4
--- /dev/null
+++ b/tools/hilive.cpp
@@ -0,0 +1,477 @@
+#include <iostream>
+#include <boost/program_options.hpp>
+
+#include "../lib/headers.h"
+#include "../lib/definitions.h"
+#include "../lib/kindex.h"
+#include "../lib/alnstream.h"
+#include "../lib/parallel.h"
+
+
+namespace po = boost::program_options;
+
+std::string license =
+"Copyright (c) 2015, Martin S. Lindner, marzin at mail-lindner.de\n"
+"All rights reserved.\n"
+"\n"
+"Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n"
+"\n"
+"1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n"
+"\n"
+"2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n"
+"\n"
+"3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n"
+"\n"
+"THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PRO [...]
+
+
+
+
+// the worker function for the threads
+void worker (TaskQueue & tasks, TaskQueue & finished, TaskQueue & failed, AlignmentSettings* settings, KixRun* idx, bool & surrender ) {
+
+ // loop that keeps on running until the surrender flag is set
+ while ( !surrender ) {
+ // try to obtain a new task
+ Task t = tasks.pop();
+ if ( t != NO_TASK ) {
+ // Execute the task
+ bool success = true;
+ try {
+ StreamedAlignment s (t.lane, t.tile, t.root, t.rlen);
+ uint64_t num_seeds = s.extend_alignment(t.cycle,idx,settings);
+ std::cout << "Task [" << t << "]: Found " << num_seeds << " seeds." << std::endl;
+ }
+ catch (const std::exception &e) {
+ std::cerr << "Failed to finish task [" << t << "]: " << e.what() << std::endl;
+ success = false;
+ }
+ if (success) {
+ finished.push(t);
+ }
+ else {
+ failed.push(t);
+ }
+
+ }
+ else {
+ // send this thread to sleep for a second
+ std::this_thread::sleep_for (std::chrono::seconds(1));
+ }
+ }
+
+}
+
+
+// create a special SAM worker, that writes out a SAM file for a tile
+void sam_worker (TaskQueue & tasks, AlignmentSettings* settings, KixRun* idx) {
+
+ // loop that keeps on running until the surrender flag is set
+ while ( true ) {
+ // try to obtain a new task
+ Task t = tasks.pop();
+ if ( t != NO_TASK ) {
+ // Execute the task
+ alignments_to_sam(t.lane,t.tile,t.root,t.rlen, idx,settings);
+ }
+ else {
+ return;
+ }
+ }
+
+}
+
+
+
+int main(int argc, char* argv[]) {
+ time_t t_start = time(NULL);
+
+ // setting up the command line interface
+ po::options_description general("General");
+ general.add_options()
+ ("help,h", "Print this help message and exit")
+ ("license", "Print licensing information and exit");
+
+ po::options_description parameters("Parameters");
+ parameters.add_options()
+ ("BC_DIR", po::value<std::string>()->required(), "Illumina BaseCalls directory")
+ ("INDEX", po::value<std::string>()->required(), "Path to k-mer index")
+ ("CYCLES", po::value<CountType>()->required(), "Number of cycles");
+
+ po::options_description io_settings("IO settings");
+ io_settings.add_options()
+ ("temp", po::value<std::string>(), "Temporary directory for the alignment files [Default: use BaseCalls directory]")
+ ("sam,S", po::value<std::string>(), "Create SAM files for each tile. [Default: no SAM files]")
+ ("keep-files,k", "Keep intermediate alignment files [Default: false]")
+ ("lanes,l", po::value< std::vector<uint16_t> >(), "Select lane [Default: all lanes]")
+ ("tiles,t", po::value< std::vector<uint16_t> >(), "Select tile numbers [Default: all tiles]");
+
+ po::options_description alignment("Alignment settings");
+ alignment.add_options()
+ ("min-errors,e", po::value<CountType>()->default_value(2), "Number of errors tolerated in read alignment")
+ ("best-hit,H", "Report only the best alignmnet(s) for each read")
+ ("best-n,N", po::value<CountType>()->default_value(2), "Report the N best alignmnets for each read")
+ ("disable-ohw-filter", "Disable the One-Hit Wonder filter")
+ ("start-ohw", po::value<CountType>()->default_value(K+5), "First cycle to apply One-Hit Wonder filter")
+ ("window,w", po::value<DiffType>()->default_value(5), "Set the window size to search for alignment continuation, i.e. maximum insertion/deletion size")
+ ("min-quality", po::value<uint16_t>()->default_value(1), "Minimum allowed basecall quality");
+
+ po::options_description technical("Technical settings");
+ technical.add_options()
+ ("block-size", po::value<uint64_t>(), "Block size for the alignment input/output stream in Bytes. Use -K or -M to specify in Kilobytes or Megabytes")
+ (",K", "Interpret the block-size argument as Kilobytes instead of Bytes")
+ (",M", "Interpret the block-size argument as Megabytes instead of Bytes")
+ ("compression,c", po::value<uint16_t>()->default_value(2), "Compress alignment files. 0: no compression (default) 1: Deflate (smaller) 2: LZ4 (faster)")
+ ("num-threads,n", po::value<int>()->default_value(1), "Number of threads to spawn")
+ ;
+
+ po::options_description cmdline_options;
+ cmdline_options.add(general).add(parameters).add(io_settings).add(alignment).add(technical);
+
+ po::options_description visible_options;
+ visible_options.add(general).add(io_settings).add(alignment).add(technical);
+
+ std::stringstream help_message;
+ help_message << "HiLive v"<< HiLive_VERSION_MAJOR << "." << HiLive_VERSION_MINOR << " - Realtime Alignment of Illumina Reads" << std::endl;
+ help_message << "Copyright (c) 2015, Martin S. Lindner" << std::endl;
+ help_message << "HiLive is open-source software. Check with --license for details." << std::endl << std::endl;
+ help_message << "Fixed k-mer size: " << K << std::endl << std::endl;
+ help_message << "Usage: " << std::string(argv[0]) << " [options] BC_DIR INDEX CYCLES" << std::endl;
+ help_message << " BC_DIR Illumina BaseCalls directory of the sequencing run to analyze" << std::endl;
+ help_message << " INDEX Path to k-mer index file (*.kix)" << std::endl;
+ help_message << " CYCLES Total number of cycles for read 1" << std::endl << std::endl;
+ help_message << visible_options << std::endl;
+
+
+ po::positional_options_description p;
+ p.add("BC_DIR", 1);
+ p.add("INDEX", 1);
+ p.add("CYCLES", 1);
+
+ po::variables_map vm;
+ try {
+ // parse arguments
+ po::store(po::command_line_parser(argc, argv).
+ options(cmdline_options).positional(p).run(), vm);
+ // first check if -h or --help was called
+ if (vm.count("help")) {
+ std::cout << help_message.str();
+ return 1;
+ }
+ // first check if --license was called
+ if (vm.count("license")) {
+ std::cout << license << std::endl;
+ return 1;
+ }
+
+ // then check arguments
+ po::notify(vm);
+ }
+ catch ( boost::program_options::required_option& e ) {
+ std::cerr << "Missing Parameter: " << e.what() << std::endl << std::endl;
+ std::cout << help_message.str();
+ return -1;
+ }
+ catch( boost::program_options::error& e) {
+ std::cerr << "Error while parsing command line options: " << e.what() << std::endl << std::endl;
+ std::cout << help_message.str();
+ return -1;
+ }
+
+
+ // variables to set
+ std::string root = vm["BC_DIR"].as<std::string>();
+ std::string index_fname = vm["INDEX"].as<std::string>();
+ CountType rlen = vm["CYCLES"].as<CountType>();
+
+ std::vector<uint16_t> lanes;
+ std::vector<uint16_t> tiles;
+
+ AlignmentSettings settings;
+
+ bool write_sam;
+ std::string sam_dir;
+
+ if (vm.count("temp"))
+ settings.temp_dir = vm["temp"].as<std::string>();
+ else
+ settings.temp_dir = "";
+
+ if (vm.count("sam")) {
+ write_sam = true;
+ settings.sam_dir = vm["sam"].as<std::string>();
+ if (settings.sam_dir == "") {
+ if (settings.temp_dir == "")
+ settings.sam_dir = root;
+ else
+ settings.sam_dir = settings.temp_dir;
+ }
+ }
+ else
+ write_sam = false;
+
+
+ settings.keep_aln_files = (vm.count("keep-files") > 0);
+
+ if (vm.count("lanes"))
+ lanes = vm["lanes"].as< std::vector<uint16_t> >();
+ else
+ lanes = all_lanes();
+
+ if (vm.count("tiles"))
+ tiles = vm["tiles"].as< std::vector<uint16_t> >();
+ else
+ tiles = all_tiles();
+
+ if (vm.count("min-errors"))
+ settings.min_errors = vm["min-errors"].as<uint16_t>();
+ else
+ settings.min_errors = 2;
+
+ settings.best_hit_mode = (vm.count("best-hit") > 0);
+ if (vm.count("best-hit") == 0 && vm.count("best-n") > 0 ) {
+ settings.best_n_mode = true;
+ settings.best_n = vm["best-n"].as<CountType>();
+ }
+
+ settings.discard_ohw = (vm.count("disable-ohw-filter") == 0);
+
+ if (vm.count("start-ohw"))
+ settings.start_ohw = vm["start-ohw"].as<CountType>();
+ else
+ settings.start_ohw = K+5;
+
+ if (vm.count("window"))
+ settings.window = vm["window"].as<DiffType>();
+ else
+ settings.window = 5;
+
+ if (vm.count("min-quality"))
+ settings.min_qual = vm["min-quality"].as<uint16_t>();
+ else
+ settings.min_qual = K+5;
+
+ if (vm.count("block-size")) {
+ if (vm.count("-M"))
+ settings.block_size = vm["block-size"].as<uint64_t>()*1024*1024;
+ else if (vm.count("-K"))
+ settings.block_size = vm["block-size"].as<uint64_t>()*1024;
+ else
+ settings.block_size = vm["block-size"].as<uint64_t>();
+ }
+ else
+ settings.block_size = 64*1024*1024;
+
+ if (vm.count("compression"))
+ settings.compression_format = (uint8_t) vm["compression"].as<uint16_t>();
+ else
+ settings.compression_format = 2;
+
+ int num_threads;
+ if (vm.count("num-threads"))
+ num_threads = vm["num-threads"].as<int>();
+ else
+ num_threads = 1;
+
+
+ // check paths and file names
+ if (!file_exists(index_fname)){
+ std::cerr << "Input error: Could not find k-mer index file " << index_fname << std::endl;
+ return -1;
+ }
+
+ std::size_t found = root.find("BaseCalls");
+ if (!(found != std::string::npos && found >= root.size()-10)) {
+ std::cerr << "Warning: BaseCalls directory seems to be invalid: " << root << std::endl;
+ }
+
+ if (!is_directory(root)){
+ std::cerr << "Input error: Could not find BaseCalls directory " << root << std::endl;
+ return -1;
+ }
+
+ for ( uint16_t ln : lanes ) {
+ std::string ln_dir = root;
+ if ( ln < 10 )
+ ln_dir += "/L00";
+ else if ( ln < 100 )
+ ln_dir += "/L0";
+ else
+ ln_dir += "/L";
+ ln_dir += std::to_string(ln);
+ if (!is_directory(ln_dir)){
+ std::cerr << "Input error: Could not find location of Lane " << ln << ": " << ln_dir << std::endl;
+ return -1;
+ }
+ }
+
+
+ // Report the basic settings
+ std::cout << "Running HiLive with " << num_threads << " thread(s)." << std::endl;
+ std::cout << "BaseCalls directory: " << root << std::endl;
+ if (settings.temp_dir != "") {
+ std::cout << "Temporary directory: " << settings.temp_dir << std::endl;
+ }
+ if (write_sam) {
+ std::cout << "SAM output directory: " << settings.sam_dir << std::endl;
+ }
+ std::cout << "Lanes: ";
+ for ( uint16_t ln : lanes )
+ std::cout << ln << " ";
+ std::cout << std::endl;
+ std::cout << "K-mer index: " << index_fname << std::endl;
+ std::cout << "Read length: " << rlen << std::endl;
+ std::cout << "Mapping error: " << settings.min_errors << std::endl;
+ if (settings.best_hit_mode)
+ std::cout << "Mapping mode: Best-Hit-Mode" << std::endl;
+ else if (settings.best_n_mode)
+ std::cout << "Mapping mode: Best-N-Mode" << std::endl;
+ else
+ std::cout << "Mapping mode: All-Hits-Mode" << std::endl;
+ std::cout << std::endl;
+
+
+ // load the index
+ std::cout << "Loading Index" << std::endl;
+ KixRun* index = new KixRun();
+ index->deserialize_file(index_fname);
+
+
+ // Create the overall agenda
+ Agenda agenda (root, rlen, lanes, tiles);
+
+
+ // prepare the alignment
+ std::cout << "Initializing Alignment files. Waiting for the first cycle to finish." << std::endl;
+ bool first_cycle_available = false;
+ int wait = 0;
+ int max_wait = 3600;
+ while ( !first_cycle_available && wait < max_wait ) {
+ // check for new BCL files and update the agenda status
+ agenda.update_status();
+
+ // check if the first cycle is available for all tiles
+ first_cycle_available = true;
+ for ( auto ln : lanes ) {
+ for ( auto tl : tiles ) {
+ if ( agenda.get_status(Task(ln,tl,1,rlen,"")) != BCL_AVAILABLE) {
+ first_cycle_available = false;
+ }
+ }
+ }
+
+ // take a small break
+ std::this_thread::sleep_for (std::chrono::milliseconds(1000));
+ wait+=1;
+ }
+
+ if (wait >= max_wait) {
+ std::cerr << "Program aborted. No BCL files found." << std::endl;
+ return -1;
+ }
+ else {
+ std::cout << "First cycle complete. Starting alignment." << std::endl;
+ }
+
+
+ // write empty alignment file for each tile
+ for (uint16_t ln : lanes) {
+ for (uint16_t tl : tiles) {
+ StreamedAlignment s (ln, tl, root, rlen);
+ s.create_directories(&settings);
+ s.init_alignment(&settings);
+ }
+ }
+
+ if (settings.temp_dir != "" && !is_directory(settings.temp_dir)){
+ std::cerr << "Error: Could not find temporary directory " << settings.temp_dir << std::endl;
+ return -1;
+ }
+
+ // Set up the queues
+ TaskQueue toDoQ;
+ TaskQueue finishedQ;
+ TaskQueue failedQ;
+
+ // Create the threads
+ std::cout << "Creating " << num_threads << " threads." << std::endl;
+ bool surrender = false;
+ std::vector<std::thread> workers;
+ for (int i = 0; i < num_threads; i++) {
+ workers.push_back(std::thread(worker, std::ref(toDoQ), std::ref(finishedQ), std::ref(failedQ), &settings, index, std::ref(surrender)));
+ }
+
+ // Process all tasks on the agenda
+ while ( !agenda.finished() ) {
+ // check for new BCL files and update the agenda status
+ agenda.update_status();
+
+ // fill the ToDo queue with tasks from the agenda
+ while(true) {
+ Task t = agenda.get_task();
+ if (t == NO_TASK)
+ break;
+ toDoQ.push(t);
+ agenda.set_status(t,RUNNING);
+ }
+
+ // take a look in the finished queue and process finished tasks
+ while(true) {
+ Task t = finishedQ.pop();
+ if (t == NO_TASK)
+ break;
+ agenda.set_status(t,FINISHED);
+ }
+
+ // take a look in the failed queue and process failed tasks
+ while(true) {
+ Task t = failedQ.pop();
+ if (t == NO_TASK)
+ break;
+ if (agenda.get_status(t) == RUNNING) {
+ // give it one more chance
+ agenda.set_status(t,RETRY);
+ toDoQ.push(t);
+ }
+ else {
+ agenda.set_status(t,FAILED);
+ std::cout << "Task failed! " << t << std::endl;
+ }
+ }
+
+ // take a small break
+ std::this_thread::sleep_for (std::chrono::seconds(1));
+ }
+
+ // Halt the threads
+ surrender = true;
+ for (auto& w : workers) {
+ w.join();
+ }
+ std::cout << "All threads joined." << std::endl;
+
+ if (write_sam) {
+ std::cout << "Writing SAM files." << std::endl;
+ // Create individual SAM files for every tile
+ TaskQueue sam_tasks;
+ std::vector<Task> tv = agenda.get_SAM_tasks();
+ for ( auto t: tv ) {
+ sam_tasks.push(t);
+ }
+
+ workers.clear();
+ for (int i = 0; i < num_threads; i++) {
+ workers.push_back(std::thread(sam_worker, std::ref(sam_tasks), &settings, index));
+ }
+
+ for (auto& w : workers) {
+ w.join();
+ }
+ }
+
+ std::cout << "Total run time: " << time(NULL) - t_start << " s" << std::endl;
+
+ delete index;
+
+ return 1;
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/hilive.git
More information about the debian-med-commit
mailing list