[med-svn] [kmc] 02/05: Imported Upstream version 2.3+dfsg
Sascha Steinbiss
sascha at steinbiss.name
Sat Dec 19 22:47:43 UTC 2015
This is an automated email from the git hooks/post-receive script.
sascha-guest pushed a commit to branch master
in repository kmc.
commit d1443eccf8995dd8e7bcb23a936ba01a2b0d614d
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date: Sat Dec 19 14:54:48 2015 +0000
Imported Upstream version 2.3+dfsg
---
kmc_api/kmc_file.cpp | 405 ++++-
kmc_api/kmc_file.h | 53 +-
kmc_api/kmer_api.cpp | 4 +-
kmc_api/kmer_api.h | 40 +-
kmc_api/kmer_defs.h | 8 +-
kmc_api/mmer.cpp | 4 +-
kmc_api/mmer.h | 4 +-
kmc_dump/kmc_dump.cpp | 8 +-
kmc_dump/nc_utils.cpp | 4 +-
kmc_dump/nc_utils.h | 4 +-
kmc_dump_sample/kmc_dump_sample.cpp | 6 +-
kmc_tools.pdf | Bin 0 -> 132755 bytes
{kmer_counter => kmc_tools}/asmlib_wrapper.h | 7 +-
kmc_tools/bundle.h | 233 +++
kmc_tools/config.h | 550 ++++++
kmc_tools/defs.h | 92 ++
kmc_tools/dump_writer.h | 174 ++
kmc_tools/expression_node.h | 198 +++
kmc_tools/fastq_filter.cpp | 364 ++++
kmc_tools/fastq_filter.h | 83 +
{kmer_counter => kmc_tools}/fastq_reader.cpp | 157 +-
{kmer_counter => kmc_tools}/fastq_reader.h | 53 +-
kmc_tools/fastq_writer.cpp | 69 +
kmc_tools/fastq_writer.h | 43 +
kmc_tools/histogram_writer.h | 54 +
kmc_tools/kmc1_db_reader.h | 379 +++++
kmc_tools/kmc1_db_writer.h | 380 +++++
kmc_tools/kmc2_db_reader.h | 1398 ++++++++++++++++
kmc_tools/kmc_header.cpp | 88 +
kmc_tools/kmc_header.h | 55 +
kmc_tools/kmc_tools.cpp | 360 ++++
.../kmc_tools.vcxproj | 135 +-
kmc_tools/kmer.h | 522 ++++++
kmc_tools/libs/alibcof64.lib | Bin 0 -> 45248 bytes
kmc_tools/libs/alibelf64.a | Bin 0 -> 61206 bytes
kmc_tools/libs/asmlib.h | 265 +++
kmc_tools/libs/bzlib.h | 282 ++++
kmc_tools/libs/bzlib_private.h | 509 ++++++
kmc_tools/libs/libamac64.a | Bin 0 -> 51960 bytes
kmc_tools/libs/libbz2.1.0.5.dylib | Bin 0 -> 16696 bytes
kmc_tools/libs/libbz2.a | Bin 0 -> 349570 bytes
kmc_tools/libs/libbzip2.lib | Bin 0 -> 290012 bytes
kmc_tools/libs/libz.1.2.5.dylib | Bin 0 -> 17592 bytes
kmc_tools/libs/libz.a | Bin 0 -> 134978 bytes
kmc_tools/libs/zconf.h | 506 ++++++
kmc_tools/libs/zlib.h | 1744 ++++++++++++++++++++
kmc_tools/libs/zlibstat.lib | Bin 0 -> 768146 bytes
{kmer_counter => kmc_tools}/meta_oper.h | 18 +-
kmc_tools/nc_utils.cpp | 17 +
{kmc_dump => kmc_tools}/nc_utils.h | 34 +-
kmc_tools/operations.h | 284 ++++
kmc_tools/output_parser.h | 174 ++
kmc_tools/parameters_parser.cpp | 536 ++++++
kmc_tools/parameters_parser.h | 111 ++
kmc_tools/parser.cpp | 184 +++
kmc_tools/parser.h | 107 ++
kmc_tools/percent_progress.cpp | 93 ++
kmc_tools/percent_progress.h | 48 +
kmc_tools/queues.h | 393 +++++
kmc_tools/stdafx.cpp | 8 +
kmc_tools/stdafx.h | 17 +
kmc_tools/targetver.h | 8 +
kmc_tools/timer.h | 33 +
kmc_tools/tokenizer.cpp | 77 +
kmc_tools/tokenizer.h | 39 +
kmer_counter.sln | 13 +
kmer_counter/asmlib_wrapper.h | 4 +-
kmer_counter/bkb_merger.h | 8 +-
kmer_counter/bkb_reader.cpp | 4 +-
kmer_counter/bkb_reader.h | 4 +-
kmer_counter/bkb_sorter.h | 8 +-
kmer_counter/bkb_subbin.h | 4 +-
kmer_counter/bkb_uncompactor.h | 4 +-
kmer_counter/bkb_writer.cpp | 4 +-
kmer_counter/bkb_writer.h | 4 +-
kmer_counter/defs.h | 13 +-
kmer_counter/fastq_reader.cpp | 4 +-
kmer_counter/fastq_reader.h | 4 +-
kmer_counter/kb_collector.h | 4 +-
kmer_counter/kb_completer.cpp | 18 +-
kmer_counter/kb_completer.h | 216 ++-
kmer_counter/kb_reader.h | 8 +-
kmer_counter/kb_sorter.h | 93 +-
kmer_counter/kb_storer.cpp | 4 +-
kmer_counter/kb_storer.h | 4 +-
kmer_counter/kmc.h | 256 ++-
kmer_counter/kmer.cpp | 4 +-
kmer_counter/kmer.h | 4 +-
kmer_counter/kmer_counter.cpp | 23 +-
kmer_counter/kmer_counter.vcxproj | 5 +-
kmer_counter/kxmer_set.h | 4 +-
kmer_counter/mem_disk_file.cpp | 4 +-
kmer_counter/mem_disk_file.h | 4 +-
kmer_counter/meta_oper.h | 4 +-
kmer_counter/mmer.cpp | 4 +-
kmer_counter/mmer.h | 4 +-
kmer_counter/params.h | 18 +-
kmer_counter/prob_qual.cpp | 67 +
kmer_counter/prob_qual.h | 20 +
kmer_counter/queues.h | 20 +-
kmer_counter/radix.cpp | 4 +-
kmer_counter/radix.h | 4 +-
kmer_counter/rev_byte.cpp | 4 +-
kmer_counter/rev_byte.h | 4 +-
kmer_counter/s_mapper.h | 4 +-
kmer_counter/small_k_buf.h | 43 +
kmer_counter/splitter.h | 422 ++++-
kmer_counter/timer.cpp | 4 +-
kmer_counter/timer.h | 4 +-
makefile | 46 +-
makefile_mac | 47 +-
111 files changed, 12317 insertions(+), 556 deletions(-)
diff --git a/kmc_api/kmc_file.cpp b/kmc_api/kmc_file.cpp
index c4c674c..462aea8 100644
--- a/kmc_api/kmc_file.cpp
+++ b/kmc_api/kmc_file.cpp
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "stdafx.h"
@@ -205,6 +205,8 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size)
result = fread(&max_count, 1, sizeof(uint32), file_pre);
original_max_count = max_count;
result = fread(&total_kmers, 1, sizeof(uint64), file_pre);
+ result = fread(&both_strands, 1, 1, file_pre);
+ both_strands = !both_strands;
signature_map_size = ((1 << (2 * signature_len)) + 1);
uint64 lut_area_size_in_bytes = size - (signature_map_size * sizeof(uint32)+header_offset + 8);
@@ -256,7 +258,7 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size)
header_index++;
counter_size = (uint32)prefix_file_buf[header_index]; //- the size of a counter in bytes;
- //- for mode 0 counter_size is 1, 2, 3, or 4
+ //- for mode 0 counter_size is 1, 2, 3, or 4 (or 5, 6, 7, 8 for small k values)
//- for mode = 1 counter_size is 4;
lut_prefix_length = prefix_file_buf[header_index] >> 32; //- the number of prefix's symbols cut frm kmers;
//- (kmer_length - lut_prefix_length) is divisible by 4
@@ -265,11 +267,18 @@ bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size)
original_min_count = (uint32)prefix_file_buf[header_index]; //- the minimal number of kmer's appearances
min_count = original_min_count;
original_max_count = prefix_file_buf[header_index] >> 32; //- the maximal number of kmer's appearances
- max_count = original_max_count;
+ //max_count = original_max_count;
header_index++;
total_kmers = prefix_file_buf[header_index]; //- the total number of kmers
+ header_index++;
+ both_strands = (prefix_file_buf[header_index] & 0x000000000000000F) == 1;
+ both_strands = !both_strands;
+
+ original_max_count += prefix_file_buf[header_index] & 0xFFFFFFFF00000000;
+ max_count = original_max_count;
+
prefix_file_buf[last_data_index] = total_kmers + 1;
sufix_size = (kmer_length - lut_prefix_length) / 4;
@@ -340,7 +349,50 @@ bool CKMCFile::CheckKmer(CKmerAPI &kmer, uint32 &count)
index_start = prefix_file_buf[pattern_prefix_value];
index_stop = prefix_file_buf[pattern_prefix_value + 1] - 1;
}
+ uint64 tmp_count ;
+ bool res = BinarySearch(index_start, index_stop, kmer, tmp_count, pattern_offset);
+ count = (uint32)tmp_count;
+ return res;
+}
+//------------------------------------------------------------------------------------------
+// Check if kmer exists.
+// IN : kmer - kmer
+// OUT: count - kmer's counter if kmer exists
+// RET: true - if kmer exists
+//------------------------------------------------------------------------------------------
+bool CKMCFile::CheckKmer(CKmerAPI &kmer, uint64 &count)
+{
+	// 64-bit counter variant of the random-access lookup.
+	// Only usable on a database opened for random access that is not at EOF.
+	if (is_opened != opened_for_RA || end_of_file)
+		return false;
+
+	// The LUT prefix occupies the most significant bits of the first word,
+	// right after the alignment padding; shift it down to use it as an index.
+	const uint32 pattern_offset = (sizeof(uint64) * 8) - (lut_prefix_length * 2) - (kmer.byte_alignment * 2);
+	const uint64 pattern_prefix_value = kmer.kmer_data[0] >> pattern_offset; //complements with 0
+	if (pattern_prefix_value >= prefix_file_buf_size)
+		return false;
+
+	// Range of suffix records to search; stays [0, 0] for an unknown version
+	int64 index_start = 0, index_stop = 0;
+
+	switch (kmc_version)
+	{
+	case 0x200:
+	{
+		// kmc2: the signature selects the bin, each bin has its own LUT
+		uint32 signature = kmer.get_signature(signature_len);
+		uint32 bin_start_pos = signature_map[signature];
+		bin_start_pos *= single_LUT_size;
+		const auto bin_lut = prefix_file_buf + bin_start_pos;
+		index_start = bin_lut[pattern_prefix_value];
+		index_stop = bin_lut[pattern_prefix_value + 1] - 1;
+		break;
+	}
+	case 0:
+		// kmc1: a single LUT indexed directly by the prefix
+		index_start = prefix_file_buf[pattern_prefix_value];
+		index_stop = prefix_file_buf[pattern_prefix_value + 1] - 1;
+		break;
+	default:
+		break;
+	}
	return BinarySearch(index_start, index_stop, kmer, count, pattern_offset);
}
@@ -461,6 +513,92 @@ bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint32 &count)
return true;
}
+
+
+//-----------------------------------------------------------------------------------------------
+// Read next kmer
+// OUT: kmer - next kmer
+// OUT: count - kmer's counter
+// RET: true - if not EOF
+//-----------------------------------------------------------------------------------------------
+bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint64 &count)
+{
+	// Sequential listing with a 64-bit counter (counters may be longer than
+	// 4 bytes for small k). Records whose counter lies outside
+	// [min_count, max_count] are skipped.
+
+	// Mask keeping the 2 * lut_prefix_length low bits of prefix_index (for kmc2 db).
+	// The shift is done on 1ull: with a plain int 1 it is evaluated in 32 bits
+	// and overflows before the result is widened to uint64.
+	uint64 prefix_mask = (1ull << 2 * lut_prefix_length) - 1;
+
+	if (is_opened != opened_for_listing)
+		return false;
+	do
+	{
+		if (end_of_file)
+			return false;
+
+		// Move to the next non-empty prefix once all suffixes of the current one are consumed
+		if (sufix_number == prefix_file_buf[prefix_index + 1])
+		{
+			prefix_index++;
+
+			while (prefix_file_buf[prefix_index] == prefix_file_buf[prefix_index + 1])
+				prefix_index++;
+		}
+
+		uint32 off = (sizeof(prefix_index)* 8) - (lut_prefix_length * 2) - kmer.byte_alignment * 2;
+
+		uint64 temp_prefix = (prefix_index & prefix_mask) << off;	// shift prefix towards MSD. "& prefix_mask" necessary for kmc2 db format
+
+		kmer.kmer_data[0] = temp_prefix;			// store prefix in an object CKmerAPI
+
+		for (uint32 i = 1; i < kmer.no_of_rows; i++)
+			kmer.kmer_data[i] = 0;
+
+		//read sufix:
+		uint32 row_index = 0;
+		uint64 suf = 0;
+
+		off = off - 8;
+
+		// Append the suffix byte by byte, right after the prefix
+		for (uint32 a = 0; a < sufix_size; a++)
+		{
+			if (index_in_partial_buf == part_size)
+				Reload_sufix_file_buf();
+
+			suf = sufix_file_buf[index_in_partial_buf++];
+			suf = suf << off;
+			kmer.kmer_data[row_index] = kmer.kmer_data[row_index] | suf;
+
+			if (off == 0)				//the end of a word in kmer_data
+			{
+				off = 56;
+				row_index++;
+			}
+			else
+				off -= 8;
+		}
+
+		//read counter (stored least significant byte first):
+		if (index_in_partial_buf == part_size)
+			Reload_sufix_file_buf();
+
+		count = sufix_file_buf[index_in_partial_buf++];
+
+		for (uint32 b = 1; b < counter_size; b++)
+		{
+			if (index_in_partial_buf == part_size)
+				Reload_sufix_file_buf();
+
+			uint64 aux = 0x000000ff & sufix_file_buf[index_in_partial_buf++];
+			aux = aux << 8 * (b);
+			count = aux | count;
+		}
+
+		sufix_number++;
+
+		if (sufix_number == total_kmers)
+			end_of_file = true;
+
+	} while ((count < min_count) || (count > max_count));
+
+	return true;
+}
+
//-------------------------------------------------------------------------------
// Reload a contents of an array "sufix_file_buf" for listing mode. Auxiliary function.
//-------------------------------------------------------------------------------
@@ -570,12 +708,23 @@ bool CKMCFile::SetMaxCount(uint32 x)
// Return a value of max_count. Kmers with counters above this threshold are ignored
// RET : a value of max_count
//----------------------------------------------------------------------------------------
-uint32 CKMCFile::GetMaxCount(void)
+uint64 CKMCFile::GetMaxCount(void)
{
return max_count;
}
//----------------------------------------------------------------------------------------
+// Return true if KMC was run without -b switch
+// RET : a value of both_strands
+//----------------------------------------------------------------------------------------
+bool CKMCFile::GetBothStrands(void)
+{
+	// Report the flag that was read from the database header
+	const bool run_without_b_switch = both_strands;
+	return run_without_b_switch;
+}
+
+
+
+//----------------------------------------------------------------------------------------
// Set original (read from *.kmer_pre) values for min_count and max_count
//----------------------------------------------------------------------------------------
void CKMCFile::ResetMinMaxCounts(void)
@@ -678,7 +827,7 @@ uint64 CKMCFile::KmerCount(void)
// _total_kmers - the total number of kmers
// RET : true if kmer_database has been opened
//---------------------------------------------------------------------------------
-bool CKMCFile::Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint32 &_max_count, uint64 &_total_kmers)
+bool CKMCFile::Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint64 &_max_count, uint64 &_total_kmers)
{
if(is_opened)
{
@@ -698,21 +847,60 @@ bool CKMCFile::Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size,
return false;
};
+// Get current parameters from kmer_database
+bool CKMCFile::Info(CKMCFileInfo& info)
+{
+	// Nothing to report until a database has been opened; info is left untouched
+	if (!is_opened)
+		return false;
+
+	info.kmer_length = kmer_length;
+	info.mode = mode;
+	info.counter_size = counter_size;
+	info.lut_prefix_length = lut_prefix_length;
+	// for kmc1 there is no signature_len, so 0 is reported
+	info.signature_len = (kmc_version == 0x200) ? signature_len : 0;
+	info.min_count = min_count;
+	info.max_count = max_count;
+	info.total_kmers = total_kmers;
+	info.both_strands = both_strands;
+	return true;
+}
+
//---------------------------------------------------------------------------------
// Get counters from read
// OUT : counters - vector of counters of each k-mer in read (of size read_len - kmer_len + 1), if some k-mer is invalid (i.e. contains 'N') the counter is equal to 0
// IN : read -
-// RET : true if success
+// RET : true if success, false if k > read length or some failure
//---------------------------------------------------------------------------------
bool CKMCFile::GetCountersForRead(const std::string& read, std::vector<uint32>& counters)
{
if (is_opened != opened_for_RA)
return false;
+
+ if (read.length() < kmer_length)
+ {
+ counters.clear();
+ return false;
+ }
+
if (kmc_version == 0x200)
- return GetCountersForRead_kmc2(read, counters);
+ {
+ if (both_strands)
+ return GetCountersForRead_kmc2_both_strands(read, counters);
+ else
+ return GetCountersForRead_kmc2(read, counters);
+ }
else if (kmc_version == 0)
- return GetCountersForRead_kmc1(read, counters);
+ {
+ if (both_strands)
+ return GetCountersForRead_kmc1_both_strands(read,counters);
+ else
+ return GetCountersForRead_kmc1(read, counters);
+ }
else
return false; //never should be here
}
@@ -767,9 +955,9 @@ uint32 CKMCFile::count_for_kmer_kmc1(CKmerAPI& kmer)
int64 index_start = prefix_file_buf[pattern_prefix_value];
int64 index_stop = prefix_file_buf[pattern_prefix_value + 1] - 1;
- uint32 counter = 0;
+ uint64 counter = 0;
if (BinarySearch(index_start, index_stop, kmer, counter, pattern_offset))
- return counter;
+ return (uint32)counter;
return 0;
}
@@ -791,15 +979,93 @@ uint32 CKMCFile::count_for_kmer_kmc2(CKmerAPI& kmer, uint32 bin_start_pos)
int64 index_start = *(prefix_file_buf + bin_start_pos + pattern_prefix_value);
int64 index_stop = *(prefix_file_buf + bin_start_pos + pattern_prefix_value + 1) - 1;
- uint32 counter = 0;
+ uint64 counter = 0;
if (BinarySearch(index_start, index_stop, kmer, counter, pattern_offset))
- return counter;
+ return (uint32)counter;
return 0;
}
//---------------------------------------------------------------------------------
// Auxiliary function.
//---------------------------------------------------------------------------------
+bool CKMCFile::GetCountersForRead_kmc1_both_strands(const std::string& read, std::vector<uint32>& counters)
+{
+	// For every k-mer of the read, look up the smaller (per CKmerAPI operator<) of the
+	// k-mer and its reversed, complemented (3 - code) form in a kmc1 database.
+	// k-mers containing an invalid symbol (negative code) get counter 0.
+	// The caller must guarantee read.length() >= kmer_length.
+	uint32 read_len = static_cast<uint32>(read.length());
+	counters.resize(read.length() - kmer_length + 1);
+
+	uint32 i = 0;					// position of the next symbol of the read to consume
+	CKmerAPI kmer(kmer_length), kmer_rev(kmer_length);
+	uint32 pos = 0;					// next symbol slot to fill in kmer
+	uint32 rev_pos = kmer_length - 1;		// next symbol slot to fill in kmer_rev (filled backwards)
+
+	uint32 counters_pos = 0;			// next counter to write
+
+	while (i + kmer_length - 1 < read_len)
+	{
+		// Build the first full k-mer after the read start or after an invalid symbol
+		bool contains_N = false;
+		while (i < read_len && pos < kmer_length)
+		{
+			if (CKmerAPI::num_codes[(uchar)read[i]] < 0)
+			{
+				pos = 0;
+				rev_pos = kmer_length - 1;
+				kmer.clear();
+				kmer_rev.clear();
+				++i;
+				// every k-mer starting before i and not yet counted contains the invalid symbol
+				uint32 wrong_kmers = MIN(i - counters_pos, static_cast<uint32>(counters.size()) - counters_pos);
+				fill_n(counters.begin() + counters_pos, wrong_kmers, 0);
+				counters_pos += wrong_kmers;
+				contains_N = true;
+				break;
+			}
+			else
+			{
+				kmer_rev.insert2bits(rev_pos--, 3 - CKmerAPI::num_codes[(uchar)read[i]]);
+				kmer.insert2bits(pos++, CKmerAPI::num_codes[(uchar)read[i++]]);
+			}
+		}
+		if (contains_N)
+			continue;
+		if (pos == kmer_length)
+		{
+			if (kmer < kmer_rev)
+				counters[counters_pos++] = count_for_kmer_kmc1(kmer);
+			else
+				counters[counters_pos++] = count_for_kmer_kmc1(kmer_rev);
+		}
+		else
+			break;
+
+		// Slide over the following symbols one at a time
+		while (i < read_len)
+		{
+			if (CKmerAPI::num_codes[(uchar)read[i]] < 0)
+			{
+				// the outer loop restarts k-mer construction (and resets the remaining state)
+				pos = 0;
+				break;
+			}
+			kmer_rev.SHR_insert2bits(3 - CKmerAPI::num_codes[(uchar)read[i]]);
+			kmer.SHL_insert2bits(CKmerAPI::num_codes[(uchar)read[i++]]);
+			if (kmer < kmer_rev)
+				counters[counters_pos++] = count_for_kmer_kmc1(kmer);
+			else
+				counters[counters_pos++] = count_for_kmer_kmc1(kmer_rev);
+		}
+	}
+	// Counters that were never reached belong to k-mers with invalid symbols
+	if (counters_pos < counters.size())
+		fill_n(counters.begin() + counters_pos, counters.size() - counters_pos, 0);
+	return true;
+}
+
+//---------------------------------------------------------------------------------
+// Auxiliary function.
+//---------------------------------------------------------------------------------
bool CKMCFile::GetCountersForRead_kmc1(const std::string& read, std::vector<uint32>& counters)
{
uint32 read_len = static_cast<uint32>(read.length());
@@ -860,24 +1126,18 @@ bool CKMCFile::GetCountersForRead_kmc1(const std::string& read, std::vector<uint
}
return true;
}
+
//---------------------------------------------------------------------------------
// Auxiliary function.
//---------------------------------------------------------------------------------
-bool CKMCFile::GetCountersForRead_kmc2(const std::string& read, std::vector<uint32>& counters)
-{
-counters.resize(read.length() - kmer_length + 1);
- std::string transformed_read = read;
- for (char& c : transformed_read)
- c = CKmerAPI::num_codes[(uchar)c];
+void CKMCFile::GetSuperKmers(const std::string& transformed_read, super_kmers_t& super_kmers)
+{
uint32 i = 0;
uint32 len = 0; //length of super k-mer
uint32 signature_start_pos;
CMmer current_signature(signature_len), end_mmer(signature_len);
- using super_kmers_t = std::vector<std::tuple<uint32, uint32, uint32>>;//start_pos, len, bin_no,
- super_kmers_t super_kmers;
-
- while (i + kmer_length - 1 < read.length())
+ while (i + kmer_length - 1 < transformed_read.length())
{
bool contains_N = false;
//building first signature after 'N' or at the read beginning
@@ -897,7 +1157,7 @@ counters.resize(read.length() - kmer_length + 1);
}
len = signature_len;
signature_start_pos = i - signature_len;
- current_signature.insert(transformed_read.c_str() + signature_start_pos);
+ current_signature.insert(transformed_read.c_str() + signature_start_pos);
end_mmer.set(current_signature);
for (; i < transformed_read.length(); ++i)
@@ -954,7 +1214,93 @@ counters.resize(read.length() - kmer_length + 1);
{
super_kmers.push_back(std::make_tuple(i - len, len, signature_map[current_signature.get()]));
}
+}
+
+//---------------------------------------------------------------------------------
+// Auxiliary function.
+//---------------------------------------------------------------------------------
+bool CKMCFile::GetCountersForRead_kmc2_both_strands(const std::string& read, std::vector<uint32>& counters)
+{
+	// kmc2 lookup: the read is split into super k-mers, and for every k-mer the smaller
+	// (per CKmerAPI operator<) of the k-mer and its reversed, complemented form is searched
+	// in the bin of its super k-mer. The caller must guarantee read.length() >= kmer_length.
+	counters.resize(read.length() - kmer_length + 1);
+
+	// Translate the symbols of the read into numeric codes
+	std::string transformed_read(read);
+	for (auto it = transformed_read.begin(); it != transformed_read.end(); ++it)
+		*it = CKmerAPI::num_codes[(uchar)*it];
+
+	super_kmers_t super_kmers;
+	GetSuperKmers(transformed_read, super_kmers);
+
+	if (super_kmers.empty())
+	{
+		fill_n(counters.begin(), counters.size(), 0);
+		return true;
+	}
+
+	CKmerAPI kmer(kmer_length), rev_kmer(kmer_length);
+	uint32 counters_pos = 0;
+	uint32 last_end = 0;
+
+	//'N' somewhere in first k-mer
+	const uint32 first_start = std::get<0>(super_kmers.front());
+	if (first_start > 0)
+	{
+		fill_n(counters.begin(), first_start, 0);
+		last_end = first_start;
+		counters_pos = first_start;
+	}
+
+	for (auto& super_kmer : super_kmers)
+	{
+		const uint32 start = std::get<0>(super_kmer);
+		const uint32 len = std::get<1>(super_kmer);
+
+		//'N's between super k-mers
+		if (last_end < start)
+		{
+			const uint32 zeros = kmer_length + (start - last_end) - 1;
+			fill_n(counters.begin() + counters_pos, zeros, 0);
+			counters_pos += zeros;
+		}
+		last_end = start + len;
+
+		const char* first_symbol = transformed_read.c_str() + start;
+		kmer.from_binary(first_symbol);
+		rev_kmer.from_binary_rev(first_symbol);
+		uint32 bin_start_pos = std::get<2>(super_kmer) * single_LUT_size;
+
+		// first k-mer of the super k-mer
+		counters[counters_pos++] = count_for_kmer_kmc2(kmer < rev_kmer ? kmer : rev_kmer, bin_start_pos);
+
+		// remaining k-mers: slide by one symbol at a time
+		for (uint32 i = start + kmer_length; i < start + len; ++i)
+		{
+			kmer.SHL_insert2bits(transformed_read[i]);
+			rev_kmer.SHR_insert2bits(3 - transformed_read[i]);
+			counters[counters_pos++] = count_for_kmer_kmc2(kmer < rev_kmer ? kmer : rev_kmer, bin_start_pos);
+		}
+	}
+
+	//'N's at the end of read
+	if (counters_pos < counters.size())
+	{
+		fill_n(counters.begin() + counters_pos, counters.size() - counters_pos, 0);
+		counters_pos = static_cast<uint32>(counters.size());
+	}
+
+	return true;
+}
+
+
+//---------------------------------------------------------------------------------
+// Auxiliary function.
+//---------------------------------------------------------------------------------
+bool CKMCFile::GetCountersForRead_kmc2(const std::string& read, std::vector<uint32>& counters)
+{
+ counters.resize(read.length() - kmer_length + 1);
+ std::string transformed_read = read;
+ for (char& c : transformed_read)
+ c = CKmerAPI::num_codes[(uchar)c];
+
+ super_kmers_t super_kmers;
+ GetSuperKmers(transformed_read, super_kmers);
+
uint32 counters_pos = 0;
if (super_kmers.empty())
{
@@ -983,8 +1329,7 @@ counters.resize(read.length() - kmer_length + 1);
counters_pos += kmer_length + gap - 1;
}
last_end = std::get<0>(super_kmer) + std::get<1>(super_kmer);
-
- kmer.clear();
+
kmer.from_binary(transformed_read.c_str() + std::get<0>(super_kmer));
uint32 bin_start_pos = std::get<2>(super_kmer) * single_LUT_size;
@@ -1010,13 +1355,15 @@ counters.resize(read.length() - kmer_length + 1);
//---------------------------------------------------------------------------------
// Auxiliary function.
//---------------------------------------------------------------------------------
-bool CKMCFile::BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI& kmer, uint32& counter, uint32 pattern_offset)
+bool CKMCFile::BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI& kmer, uint64& counter, uint32 pattern_offset)
{
+ if (index_start >= total_kmers)
+ return false;
uchar *sufix_byte_ptr = nullptr;
uint64 sufix = 0;
//sufix_offset is always 56
- uint32 sufix_offset = 56; // the ofset of a sufix is for shifting the sufix towards MSB, to compare the sufix with a pattern
+ uint32 sufix_offset = 56; // the offset of a sufix is for shifting the sufix towards MSB, to compare the sufix with a pattern
// Bytes of a pattern to search are always shifted towards MSB
uint32 row_index = 0; // the number of a current row in an array kmer_data
@@ -1073,7 +1420,7 @@ bool CKMCFile::BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI&
for (uint32 b = 1; b < counter_size; b++)
{
- uint32 aux = 0x000000ff & *(sufix_byte_ptr + b);
+ uint64 aux = 0x000000ff & *(sufix_byte_ptr + b);
aux = aux << 8 * (b);
counter = aux | counter;
diff --git a/kmc_api/kmc_file.h b/kmc_api/kmc_file.h
index 73676f9..19c540c 100644
--- a/kmc_api/kmc_file.h
+++ b/kmc_api/kmc_file.h
@@ -4,9 +4,9 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
-*/
+ Version: 2.3.0
+ Date : 2015-08-21
+ */
#ifndef _KMC_FILE_H
#define _KMC_FILE_H
@@ -16,6 +16,19 @@
#include <string>
#include <vector>
+struct CKMCFileInfo // Parameters of an opened k-mer database, filled in by CKMCFile::Info(CKMCFileInfo&)
+{
+ uint32 kmer_length; // length of a k-mer
+ uint32 mode; // database mode (per the loader's comments, counter_size is 4 for mode 1)
+ uint32 counter_size; // size of a counter in bytes
+ uint32 lut_prefix_length; // number of prefix symbols cut from k-mers
+ uint32 signature_len; // signature length; 0 for kmc1 databases, which have no signature_len
+ uint32 min_count; // current lower threshold: k-mers with smaller counters are skipped when listing
+ uint64 max_count; // current upper threshold: k-mers with counters above it are ignored
+ bool both_strands; // true if KMC was run without the -b switch
+ uint64 total_kmers; // total number of k-mers in the database
+};
+
class CKMCFile
{
enum open_mode {closed, opened_for_RA, opened_for_listing};
@@ -44,19 +57,20 @@ class CKMCFile
uint32 lut_prefix_length;
uint32 signature_len;
uint32 min_count;
- uint32 max_count;
+ uint64 max_count;
uint64 total_kmers;
+ bool both_strands;
uint32 kmc_version;
uint32 sufix_size; // sufix's size in bytes
uint32 sufix_rec_size; // sufix_size + counter_size
uint32 original_min_count;
- uint32 original_max_count;
+ uint64 original_max_count;
static uint64 part_size; // the size of a block readed to sufix_file_buf, in listing mode
- bool BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI& kmer, uint32& counter, uint32 pattern_offset);
+ bool BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI& kmer, uint64& counter, uint32 pattern_offset);
// Open a file, recognize its size and check its marker. Auxiliary function.
bool OpenASingleFile(const std::string &file_name, FILE *&file_handler, uint64 &size, char marker[]);
@@ -67,8 +81,17 @@ class CKMCFile
// Reload a contents of an array "sufix_file_buf" for listing mode. Auxiliary function.
void Reload_sufix_file_buf();
- // Implementation of GetCountersForRead for kmc1 database format
- bool GetCountersForRead_kmc1(const std::string& read, std::vector<uint32>& counters);
+ // Implementation of GetCountersForRead for kmc1 database format for both strands
+ bool GetCountersForRead_kmc1_both_strands(const std::string& read, std::vector<uint32>& counters);
+
+ // Implementation of GetCountersForRead for kmc1 database format without choosing canonical k-mer
+ bool GetCountersForRead_kmc1(const std::string& read, std::vector<uint32>& counters);
+
+ using super_kmers_t = std::vector<std::tuple<uint32, uint32, uint32>>;//start_pos, len, bin_no
+ void GetSuperKmers(const std::string& transformed_read, super_kmers_t& super_kmers);
+
+ // Implementation of GetCountersForRead for kmc2 database format for both strands
+ bool GetCountersForRead_kmc2_both_strands(const std::string& read, std::vector<uint32>& counters);
// Implementation of GetCountersForRead for kmc2 database format
bool GetCountersForRead_kmc2(const std::string& read, std::vector<uint32>& counters);
@@ -86,6 +109,8 @@ public:
// Return next kmer in CKmerAPI &kmer. Return its counter in float &count. Return true if not EOF
bool ReadNextKmer(CKmerAPI &kmer, float &count);
+ bool ReadNextKmer(CKmerAPI &kmer, uint64 &count); //for small k-values when counter may be longer than 4bytes
+
bool ReadNextKmer(CKmerAPI &kmer, uint32 &count);
// Release memory and close files in case they were opened
bool Close();
@@ -100,7 +125,10 @@ public:
bool SetMaxCount(uint32 x);
// Return a value of max_count. Kmers with counters above this threshold are ignored
- uint32 GetMaxCount(void);
+ uint64 GetMaxCount(void);
+
+ //Return true if kmc was run without -b switch.
+ bool GetBothStrands(void);
// Return the total number of kmers between min_count and max_count
uint64 KmerCount(void);
@@ -119,6 +147,8 @@ public:
bool CheckKmer(CKmerAPI &kmer, uint32 &count);
+ bool CheckKmer(CKmerAPI &kmer, uint64 &count);
+
// Return true if kmer exists
bool IsKmer(CKmerAPI &kmer);
@@ -126,8 +156,11 @@ public:
void ResetMinMaxCounts(void);
// Get current parameters from kmer_database
- bool Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint32 &_max_count, uint64 &_total_kmers);
+ bool Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint64 &_max_count, uint64 &_total_kmers);
+ // Get current parameters from kmer_database
+ bool Info(CKMCFileInfo& info);
+
// Get counters for all k-mers in read
bool GetCountersForRead(const std::string& read, std::vector<uint32>& counters);
bool GetCountersForRead(const std::string& read, std::vector<float>& counters);
diff --git a/kmc_api/kmer_api.cpp b/kmc_api/kmer_api.cpp
index befd9fe..c6158df 100644
--- a/kmc_api/kmer_api.cpp
+++ b/kmc_api/kmer_api.cpp
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz and Agnieszka Debudaj-Grabysz
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
diff --git a/kmc_api/kmer_api.h b/kmc_api/kmer_api.h
index e652aa2..115249d 100644
--- a/kmc_api/kmer_api.h
+++ b/kmc_api/kmer_api.h
@@ -4,8 +4,8 @@ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
Authors: Sebastian Deorowicz and Agnieszka Debudaj-Grabysz
-Version: 2.2.0
-Date : 2015-04-15
+Version: 2.3.0
+Date : 2015-08-21
*/
#ifndef _KMER_API_H
@@ -15,6 +15,7 @@ Date : 2015-04-15
#include "kmer_defs.h"
#include <string>
#include <iostream>
+#include <vector>
#include "mmer.h"
class CKMCFile;
@@ -65,6 +66,20 @@ protected:
}
kmer_data[no_of_rows - 1] += (uint64)val << (62 - (((kmer_length - 1 + byte_alignment) & 31) * 2));
}
+
+ //----------------------------------------------------------------------------------
+	inline void SHR_insert2bits(uchar val)
+	{
+		// Shift the whole k-mer right by one symbol (2 bits), carrying the two
+		// lowest bits of each row into the top of the next one.
+		const auto last_row = no_of_rows - 1;
+		for (uint32 row = last_row; row > 0; --row)
+		{
+			kmer_data[row] >>= 2;
+			kmer_data[row] += kmer_data[row - 1] << 62;
+		}
+		kmer_data[0] >>= 2;
+
+		// Mask out the symbol that fell off the end of the k-mer in the last row
+		const auto symbols_in_last_row = kmer_length + byte_alignment - last_row * 32;
+		const auto bits_below_kmer = (1ull << ((32 - symbols_in_last_row) * 2)) - 1;
+		kmer_data[last_row] &= ~bits_below_kmer;
+
+		// Put the new symbol in front, just after the alignment padding
+		kmer_data[0] += ((uint64)val << 62) >> (byte_alignment * 2);
+	}
+
// ----------------------------------------------------------------------------------
inline void from_binary(const char* kmer)
{
@@ -74,6 +89,14 @@ protected:
}
// ----------------------------------------------------------------------------------
+	inline void from_binary_rev(const char* kmer)
+	{
+		// Fill this k-mer from numeric symbol codes read back to front,
+		// storing 3 - code for each symbol.
+		clear();
+		const char* src = kmer + kmer_length;	// one past the last symbol
+		for (uint32 pos = 0; pos < kmer_length; ++pos)
+		{
+			--src;
+			insert2bits(pos, 3 - *src);
+		}
+	}
+
+ // ----------------------------------------------------------------------------------
template<typename RandomAccessIterator>
inline void to_string_impl(RandomAccessIterator iter)
{
@@ -424,6 +447,19 @@ public:
str[kmer_length] = '\0';
};
+
+	// Copy the k-mer into `kmer` as no_of_rows 64-bit words, shifted right by
+	// `offset` bits with the low bits of each row carried into the next one.
+	// OUT : kmer - resized to no_of_rows and overwritten
+	inline void to_long(std::vector<uint64>& kmer)
+	{
+		kmer.resize(no_of_rows);
+		const uint32 offset = 62 - ((kmer_length - 1 + byte_alignment) & 31) * 2;
+		if (offset == 0)
+		{
+			// Nothing to shift. The general path would evaluate
+			// kmer_data[i - 1] << 64, which is undefined behaviour.
+			for (uint32 i = 0; i < no_of_rows; ++i)
+				kmer[i] = kmer_data[i];
+			return;
+		}
+		for (int32 i = no_of_rows - 1; i >= 1; --i)
+		{
+			kmer[i] = kmer_data[i] >> offset;
+			// bits shifted out of the previous row enter at the top of this one
+			kmer[i] += kmer_data[i - 1] << (64 - offset);
+		}
+		kmer[0] = kmer_data[0] >> offset;
+	}
+
//-----------------------------------------------------------------------
// Convert kmer into string (an alphabet ACGT)
// OUT : str - string kmer
diff --git a/kmc_api/kmer_defs.h b/kmc_api/kmer_defs.h
index e4b2b80..8328822 100644
--- a/kmc_api/kmer_defs.h
+++ b/kmc_api/kmer_defs.h
@@ -4,16 +4,16 @@
Authors: Sebastian Deorowicz and Agnieszka Debudaj-Grabysz
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KMER_DEFS_H
#define _KMER_DEFS_H
-#define KMC_VER "2.2.0"
-#define KMC_DATE "2015-04-15"
+#define KMC_VER "2.3.0"
+#define KMC_DATE "2015-08-21"
#define MIN(x,y) ((x) < (y) ? (x) : (y))
diff --git a/kmc_api/mmer.cpp b/kmc_api/mmer.cpp
index ed3ea11..8f7496d 100644
--- a/kmc_api/mmer.cpp
+++ b/kmc_api/mmer.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "../kmc_api/mmer.h"
diff --git a/kmc_api/mmer.h b/kmc_api/mmer.h
index 79187f8..486eff4 100644
--- a/kmc_api/mmer.h
+++ b/kmc_api/mmer.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _MMER_H
diff --git a/kmc_dump/kmc_dump.cpp b/kmc_dump/kmc_dump.cpp
index 807dee6..fe87720 100644
--- a/kmc_dump/kmc_dump.cpp
+++ b/kmc_dump/kmc_dump.cpp
@@ -7,8 +7,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "stdafx.h"
@@ -85,7 +85,7 @@ int _tmain(int argc, char* argv[])
uint32 _lut_prefix_length;
uint32 _signature_len;
uint32 _min_count;
- uint32 _max_count;
+ uint64 _max_count;
uint64 _total_kmers;
kmer_data_base.Info(_kmer_length, _mode, _counter_size, _lut_prefix_length, _signature_len, _min_count, _max_count, _total_kmers);
@@ -118,7 +118,7 @@ int _tmain(int argc, char* argv[])
}
else
{
- uint32 counter;
+ uint64 counter;
while (kmer_data_base.ReadNextKmer(kmer_object, counter))
{
kmer_object.to_string(str);
diff --git a/kmc_dump/nc_utils.cpp b/kmc_dump/nc_utils.cpp
index 049ce18..c46b854 100644
--- a/kmc_dump/nc_utils.cpp
+++ b/kmc_dump/nc_utils.cpp
@@ -7,8 +7,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "stdafx.h"
diff --git a/kmc_dump/nc_utils.h b/kmc_dump/nc_utils.h
index fd0ebb3..8e85caa 100644
--- a/kmc_dump/nc_utils.h
+++ b/kmc_dump/nc_utils.h
@@ -7,8 +7,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include <string>
diff --git a/kmc_dump_sample/kmc_dump_sample.cpp b/kmc_dump_sample/kmc_dump_sample.cpp
index afe878e..b7e0417 100644
--- a/kmc_dump_sample/kmc_dump_sample.cpp
+++ b/kmc_dump_sample/kmc_dump_sample.cpp
@@ -7,8 +7,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "stdafx.h"
@@ -83,7 +83,7 @@ int _tmain(int argc, char* argv[])
uint32 _lut_prefix_length;
uint32 _signature_len;
uint32 _min_count;
- uint32 _max_count;
+ uint64 _max_count;
uint64 _total_kmers;
kmer_data_base.Info(_kmer_length, _mode, _counter_size, _lut_prefix_length, _signature_len, _min_count, _max_count, _total_kmers);
diff --git a/kmc_tools.pdf b/kmc_tools.pdf
new file mode 100644
index 0000000..eea3bd9
Binary files /dev/null and b/kmc_tools.pdf differ
diff --git a/kmer_counter/asmlib_wrapper.h b/kmc_tools/asmlib_wrapper.h
similarity index 74%
copy from kmer_counter/asmlib_wrapper.h
copy to kmc_tools/asmlib_wrapper.h
index 2fd2794..a93cdbf 100644
--- a/kmer_counter/asmlib_wrapper.h
+++ b/kmc_tools/asmlib_wrapper.h
@@ -2,12 +2,13 @@
This file is a part of KMC software distributed under GNU GPL 3 licence.
The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
- Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+ Authors: Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
+
#ifndef _ASMLIB_WRAPPER_H
#define _ASMLIB_WRAPPER_H
diff --git a/kmc_tools/bundle.h b/kmc_tools/bundle.h
new file mode 100644
index 0000000..39db63b
--- /dev/null
+++ b/kmc_tools/bundle.h
@@ -0,0 +1,233 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _BUNDLE_H
+#define _BUNDLE_H
+#include "defs.h"
+#include "kmer.h"
+
+
+//************************************************************************************************************
+// CBundle and CInput are CORE classes of this application. CInputs are nodes of a binary tree which
+// represent operations. Leaves of this tree are kmc database (1 or 2) inputs (sets of k-mers).
+// Each node represents an operation like intersection, subtraction, etc. Because this class is abstract,
+// calling a virtual method to get each single k-mer may be costly. To prevent high cost, between tree nodes there
+// are instances of CBundle which contain a buffer of k-mers and their counters.
+//
+// The algorithm works as follows (conceptually):
+// Build a tree with CBundles and CInputs; as the root take some output writer (kmc database).
+// The root has its bundle and gets k-mers from it, but at the beginning there is nothing in the bundle. Each bundle
+// contains a pointer to the CInput below it in the tree. The CBundle gets k-mers from its CInput.
+// This is repeated from the top of the tree down to the leaves.
+//************************************************************************************************************
+
+//Forward declaration
+template<unsigned SIZE> class CBundle;
+
+//************************************************************************************************************
+// CInput - Base abstract class representing data source for CBundle class
+//************************************************************************************************************
+template<unsigned SIZE>
+class CInput
+{
+protected:
+	// Reported by Finished(); starts out false. Nothing in this base class changes it,
+	// so it is presumably raised by derived classes once they run out of k-mers (TODO confirm).
+	bool finished = false;
+
+public:
+	// Called by CBundle<SIZE>::Finished() on an emptied bundle to obtain more k-mers.
+	virtual void NextBundle(CBundle<SIZE>& bundle) = 0;
+
+	// Forwarded from CBundle<SIZE>::IgnoreRest(); the meaning is defined by implementations.
+	virtual void IgnoreRest() = 0;
+
+	bool Finished()
+	{
+		return finished;
+	}
+
+	virtual ~CInput()
+	{
+	}
+};
+
+
+
+//************************************************************************************************************
+// CBundleData - class containing a buffer of k-mers and its counters.
+//************************************************************************************************************
+template<unsigned SIZE> class CBundleData
+{
+public:
+	// Allocates room for BUNDLE_CAPACITY k-mers and the same number of counters.
+	CBundleData() : insert_pos(0), get_pos(0), size(BUNDLE_CAPACITY)
+	{
+		kmers = new CKmer<SIZE>[size];
+		counters = new uint32[size];
+	}
+	~CBundleData()
+	{
+		delete[] kmers;
+		delete[] counters;
+	}
+
+	// Move operations steal both buffers and leave rhs empty (null buffers, zero positions and size).
+	CBundleData(CBundleData<SIZE>&& rhs) noexcept :
+		insert_pos(rhs.insert_pos), get_pos(rhs.get_pos), size(rhs.size), kmers(rhs.kmers), counters(rhs.counters)
+	{
+		rhs.counters = nullptr;
+		rhs.kmers = nullptr;
+		rhs.get_pos = rhs.size = rhs.insert_pos = 0;
+	}
+
+	CBundleData<SIZE>& operator=(CBundleData<SIZE>&& rhs) noexcept
+	{
+		if (this != &rhs)
+		{
+			delete[] kmers;
+			delete[] counters;
+
+			kmers = rhs.kmers;
+			counters = rhs.counters;
+			get_pos = rhs.get_pos;
+			size = rhs.size;
+			insert_pos = rhs.insert_pos;
+
+			rhs.counters = nullptr;
+			rhs.kmers = nullptr;
+			rhs.get_pos = rhs.size = rhs.insert_pos = 0;
+		}
+		return *this;
+	}
+
+	// Copying is disabled: the buffers are owned exclusively.
+	CBundleData(const CBundleData<SIZE>&) = delete;
+	CBundleData<SIZE>& operator=(const CBundleData<SIZE>&) = delete;
+
+	// k-mer at the current read position (no bounds check).
+	CKmer<SIZE>& TopKmer() const
+	{
+		return kmers[get_pos];
+	}
+
+	// Counter at the current read position (no bounds check).
+	uint32& TopCounter() const
+	{
+		return counters[get_pos];
+	}
+
+	// True when no more elements can be inserted.
+	bool Full()
+	{
+		return insert_pos >= size;
+	}
+
+	// True when every inserted element has already been popped.
+	bool Empty()
+	{
+		return get_pos >= insert_pos;
+	}
+
+	// Appends a k-mer with its counter; the caller is expected to check Full() first.
+	void Insert(CKmer<SIZE>& kmer, uint32 counter)
+	{
+		kmers[insert_pos] = kmer;
+		counters[insert_pos++] = counter;
+	}
+
+	// Advances the read position by one element.
+	void Pop()
+	{
+		++get_pos;
+	}
+
+	// Resets read and write positions; the buffers are kept.
+	void Clear()
+	{
+		insert_pos = get_pos = 0;
+	}
+
+private:
+	friend class CBundle<SIZE>;
+	uint32 insert_pos, get_pos, size;
+	CKmer<SIZE>* kmers;
+	uint32* counters;
+};
+
+
+
+//************************************************************************************************************
+// CBundle - connector between CBundleData and CInput
+//************************************************************************************************************
+template<unsigned SIZE>
+class CBundle
+{
+public:
+	// Takes ownership of input: it is deleted in the destructor.
+	CBundle(CInput<SIZE>* input) : input(input)
+	{
+	}
+
+	~CBundle()
+	{
+		delete input;
+	}
+
+	// --- reading side ---
+	CKmer<SIZE>& TopKmer() const { return data.TopKmer(); }
+
+	uint32& TopCounter() const { return data.TopCounter(); }
+
+	void Pop() { data.Pop(); }
+
+	bool Empty() { return data.Empty(); }
+
+	// Refills the buffer from the input when needed; defined below the class.
+	inline bool Finished();
+
+	// --- writing side ---
+	bool Full() { return data.Full(); }
+
+	void Insert(CKmer<SIZE>& kmer, uint32 counter) { data.Insert(kmer, counter); }
+
+	// Number of elements inserted since the buffer was last cleared.
+	uint32 Size() { return data.insert_pos; }
+
+	// --- other ---
+	CBundleData<SIZE>& Data() { return data; }
+
+	void IgnoreRest() { input->IgnoreRest(); }
+
+private:
+	CBundleData<SIZE> data;
+	CInput<SIZE>* input;
+	bool finished = false;
+};
+
+//************************************************************************************************************
+// Returns false while there is a k-mer available at the top of the bundle. When the buffer
+// has been consumed, asks the input for another portion; returns true (and remembers it)
+// once the input is finished or delivers nothing.
+template<unsigned SIZE> inline bool CBundle<SIZE>::Finished()
+{
+	if (finished)
+		return true;
+
+	// Something is still buffered
+	if (data.get_pos < data.insert_pos)
+		return false;
+
+	if (!input->Finished())
+	{
+		data.Clear();
+		input->NextBundle(*this);
+		if (data.insert_pos != 0)
+			return false;
+		// NextBundle added nothing, so there is nothing more to take
+	}
+
+	finished = true;
+	return true;
+}
+
+#endif
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/config.h b/kmc_tools/config.h
new file mode 100644
index 0000000..3c26e4c
--- /dev/null
+++ b/kmc_tools/config.h
@@ -0,0 +1,550 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+
+#include "defs.h"
+#include <string>
+#include <vector>
+#include <memory>
+#include "kmc_header.h"
+#include "percent_progress.h"
+#include "queues.h"
+
+// Fields shared by input and output descriptions: a path plus counter cutoffs.
+struct CDescBase
+{
+	std::string file_src;
+	uint32 cutoff_min = 0; //0 means it is not set yet
+	uint32 cutoff_max = 0; //0 means it is not set yet
+
+	CDescBase() = default;
+
+	CDescBase(const std::string& src)
+		: file_src(src)
+	{
+	}
+};
+
+//************************************************************************************************************
+// CInputDesc - description of a single input KMC database.
+//************************************************************************************************************
+struct CInputDesc : public CDescBase
+{
+	uint32 threads = 0; //for kmc2 input
+
+	CInputDesc() = default;
+
+	// Only the path is set here; threads and the cutoffs keep their defaults.
+	CInputDesc(const std::string& src)
+		: CDescBase(src)
+	{
+	}
+};
+
+//************************************************************************************************************
+// COutputDesc - description of a output KMC database.
+//************************************************************************************************************
+struct COutputDesc : public CDescBase
+{
+	uint32 counter_max = 0; //0 means it is not set yet
+
+	COutputDesc() = default;
+
+	// Only the path is set here; counter_max and the cutoffs keep their defaults.
+	COutputDesc(const std::string& src)
+		: CDescBase(src)
+	{
+	}
+};
+
+// Parameters of the filter operation. Every scalar has a default so that an instance
+// created with automatic storage never holds indeterminate values.
+struct CFilteringParams
+{
+	enum class file_type { fasta, fastq };
+	uint32 n_readers = 0;
+	uint32 n_filters = 0;
+	int fastq_buffer_size = 0;
+	int64 mem_part_pmm_fastq_reader = 0;
+	int64 mem_tot_pmm_fastq_reader = 0;
+
+	int64 mem_part_pmm_fastq_filter = 0;
+	int64 mem_tot_pmm_fastq_filter = 0;
+
+	uint32 kmer_len = 0;
+	uint32 gzip_buffer_size = 64 << 20;
+	uint32 bzip2_buffer_size = 64 << 20;
+
+	std::vector<std::string> input_srcs;
+	// NOTE(review): presumably selects f_min_kmers/f_max_kmers over n_min_kmers/n_max_kmers - confirm in the parser
+	bool use_float_value = false;
+	uint32 n_min_kmers = 2;
+	uint32 n_max_kmers = 1000000000;
+	float f_min_kmers = 0.0f;
+	float f_max_kmers = 1.0f;
+	file_type input_file_type = file_type::fastq;
+	file_type output_file_type = file_type::fastq;
+	std::string output_src;
+};
+
+// Options of the dump operation.
+struct CDumpParams
+{
+	bool sorted_output{ false };
+};
+
+
+
+// Bundle of queue and memory-pool pointers used by the filter operation.
+// The pointers are not initialized here.
+struct CFilteringQueues
+{
+	CInputFilesQueue* input_files_queue;
+	CPartQueue* input_part_queue;
+	CPartQueue* filtered_part_queue;
+	CMemoryPool* pmm_fastq_reader;
+	CMemoryPool* pmm_fastq_filter;
+};
+
+
+//************************************************************************************************************
+// CConfig - configuration of current application run. Singleton class.
+//************************************************************************************************************
+class CConfig
+{
+public:
+	enum class Mode { UNDEFINED, INTERSECTION, KMERS_SUBTRACT, COUNTERS_SUBTRACT, UNION, COMPLEX, SORT, REDUCE, COMPACT, HISTOGRAM, DUMP, COMPARE, FILTER };
+	uint32 avaiable_threads; // no in-class initializer
+	uint32 kmer_len = 0;
+	Mode mode = Mode::UNDEFINED;
+	bool verbose = false;
+	std::vector<CInputDesc> input_desc;
+	std::vector<CKMC_header> headers;
+	COutputDesc output_desc;
+
+	CFilteringParams filtering_params; //for filter operation only
+	CDumpParams dump_params; //for dump operation only
+
+	CPercentProgress percent_progress;
+
+	// Access to the single instance (function-local static).
+	static CConfig& GetInstance()
+	{
+		static CConfig config;
+		return config;
+	}
+	CConfig(const CConfig&) = delete;
+	CConfig& operator=(const CConfig&) = delete;
+
+	// True for operations taking two input databases.
+	bool Is2ArgOper()
+	{
+		switch (mode)
+		{
+		case Mode::UNION:
+		case Mode::KMERS_SUBTRACT:
+		case Mode::COUNTERS_SUBTRACT:
+		case Mode::INTERSECTION:
+		case Mode::COMPARE:
+			return true;
+		default:
+			return false;
+		}
+	}
+
+	bool IsComplex()
+	{
+		return Mode::COMPLEX == mode;
+	}
+
+	// True for operations taking a single input database.
+	bool Is1ArgOper()
+	{
+		switch (mode)
+		{
+		case Mode::SORT:
+		case Mode::REDUCE:
+		case Mode::COMPACT:
+		case Mode::HISTOGRAM:
+		case Mode::DUMP:
+			return true;
+		default:
+			return false;
+		}
+	}
+
+	// Command-line name of the current operation; empty for UNDEFINED or an unknown mode.
+	std::string GetOperationName()
+	{
+		const char* op_name = "";
+		switch (mode)
+		{
+		case Mode::INTERSECTION:		op_name = "intersect"; break;
+		case Mode::KMERS_SUBTRACT:		op_name = "kmers_subtract"; break;
+		case Mode::COUNTERS_SUBTRACT:	op_name = "counters_subtract"; break;
+		case Mode::UNION:				op_name = "union"; break;
+		case Mode::COMPLEX:				op_name = "complex"; break;
+		case Mode::SORT:				op_name = "sort"; break;
+		case Mode::REDUCE:				op_name = "reduce"; break;
+		case Mode::COMPACT:				op_name = "compact"; break;
+		case Mode::HISTOGRAM:			op_name = "histogram"; break;
+		case Mode::DUMP:				op_name = "dump"; break;
+		case Mode::COMPARE:				op_name = "compare"; break;
+		case Mode::FILTER:				op_name = "filter"; break;
+		case Mode::UNDEFINED:
+		default:
+			break;
+		}
+		return op_name;
+	}
+
+private:
+	CConfig() = default;
+};
+
+
+
+// Base class of per-operation help printers; holds the operation name and the
+// help fragments shared by one- and two-argument operations.
+class CUsageDisplayer
+{
+protected:
+	std::string name;
+	bool is2ArgOper = false;
+	bool is1ArgOper = false;
+
+	CUsageDisplayer(const std::string& name) : name(name)
+	{
+	}
+
+	// Prints syntax and cutoff parameters common to all two-input operations.
+	void Display2ArgGeneral() const
+	{
+		std::cout << "The '" << name << "' is two arguments' operation. General syntax:\n"
+			<< " kmc_tools " << name << " <input1 [input1_params]> <input2 [input2_params]> <output [output_params]>\n"
+			<< " input1, input2 - paths to databases generated by KMC \n"
+			<< " output - path to output database\n"
+			<< " For each input there are additional parameters:\n"
+			<< " -ci<value> - exclude k-mers occurring less than <value> times \n"
+			<< " -cx<value> - exclude k-mers occurring more of than <value> times\n"
+			<< " For output there are additional parameters:\n"
+			<< " -ci<value> - exclude k-mers occurring less than <value> times \n"
+			<< " -cx<value> - exclude k-mers occurring more of than <value> times\n"
+			<< " -cs<value> - maximal value of a counter\n";
+	}
+
+	// Prints syntax and input parameters common to single-input operations.
+	// IN	: output_params - whether "[output_params]" is shown after <output>
+	void Display1ArgGeneral(bool output_params) const
+	{
+		const char* const output_params_hint = output_params ? "[output_params]" : "";
+		std::cout << " The '" << name << "' is one argument operation. General syntax:\n"
+			<< " kmc_tools " << name << " <input> [input_params] <output> " << output_params_hint << "\n"
+			<< " input - path to database generated by KMC \n"
+			<< " For input there are additional parameters:\n"
+			<< " -ci<value> - exclude k-mers occurring less than <value> times \n"
+			<< " -cx<value> - exclude k-mers occurring more of than <value> times\n";
+	}
+
+public:
+	virtual void Display() const = 0;
+
+	virtual ~CUsageDisplayer()
+	{
+	}
+};
+
+// Prints the top-level help: version, list of operations and global parameters.
+class CGeneralUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CGeneralUsageDisplayer() : CUsageDisplayer("")
+	{
+	}
+
+	void Display() const override
+	{
+		std::cout << "kmc_tools ver. " << KMC_VER << " (" << KMC_DATE << ")\n"
+			<< "Usage:\n kmc_tools [global parameters] <operation> [operation parameters]\n"
+			<< "Available operations:\n"
+			<< " k-mers sets' operations for 2 KMC's databases:\n"
+			<< " intersect - intersection of 2 k-mers' sets\n"
+			<< " kmers_subtract - subtraction of 2 k-mers' sets\n"
+			<< " counters_subtract - counters' subtraction of 2 k-mers' sets\n"
+			<< " union - union of 2 k-mers' sets\n\n"
+			<< " operations for single kmc database:\n"
+			<< " sort - sorts k-mers from database generated by KMC2.x\n"
+			<< " reduce - exclude too rare and too frequent k-mers\n"
+			<< " compact - remove counters (store only k-mers)\n"
+			<< " histogram - histogram of k-mers occurences\n"
+			<< " dump - dump k-mers and counters to text file\n"
+			<< " more complex operations:\n"
+			<< " complex - complex operations with a number of input databases\n"
+			<< " other operatations:\n"
+			<< " filter - filter out reads with too small number of k-mers\n"
+			<< " global parameters:\n"
+			<< " -t<value> - total number of threads (default: no. of CPU cores)\n"
+			<< " -v - enable verbose mode (shows some information) (default: false)\n"
+			<< " -hp - hide percentage progress (default: false)\n"
+			<< "Example:\n"
+			<< "kmc_tools union db1 -ci3 db2 -ci5 -cx300 db1_union_db2 -ci10\n"
+			<< "For detailed help of concrete operation type operation name without parameters:\n"
+			<< "kmc_tools union\n";
+	}
+};
+
+// Help text for the 'union' operation.
+class CUnionUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CUnionUsageDisplayer() :CUsageDisplayer("union")
+	{}
+	void Display() const override
+	{
+		Display2ArgGeneral();
+		// A union keeps k-mers from either input; the description ends with a newline so that
+		// "Example:" starts on its own line.
+		std::cout << "The output database will contains each k-mer present in any of the input sets. For the same k-mers in first and second input the counter in output is equal to sum from inputs.\n";
+		std::cout << "Example:\n";
+		// kmc takes the k-mer length as -k<len>, written without a space after the dash
+		std::cout << "kmc -k28 file1.fastq kmers1 tmp\n";
+		std::cout << "kmc -k28 file2.fastq kmers2 tmp\n";
+		std::cout << "kmc_tools union kmers1 -ci3 -cx70000 kmers2 kmers1_kmers2_union -cs65536\n";
+	}
+};
+
+// Help text for the 'intersect' operation.
+class CIntersectUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CIntersectUsageDisplayer() :CUsageDisplayer("intersect")
+	{}
+	void Display() const override
+	{
+		Display2ArgGeneral();
+		// The description ends with a newline so that "Example:" starts on its own line.
+		std::cout << "The output database will contains only k-mers that are present in both input sets. The counter value in output database is equal to lower counter value in input.\n";
+		std::cout << "Example:\n";
+		// kmc takes the k-mer length as -k<len>, written without a space after the dash
+		std::cout << "kmc -k28 file1.fastq kmers1 tmp\n";
+		std::cout << "kmc -k28 file2.fastq kmers2 tmp\n";
+		std::cout << "kmc_tools intersect kmers1 -ci10 -cx200 kmers2 -ci4 -cx100 kmers1_kmers2_intersect -ci20 -cx150\n";
+	}
+};
+
+
+// Help text for the 'counters_subtract' operation.
+class CCountersSubtractUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CCountersSubtractUsageDisplayer() :CUsageDisplayer("counters_subtract")
+	{}
+	void Display() const override
+	{
+		Display2ArgGeneral();
+		// "k-mer" is written without spaces, as in the rest of the help text; the description
+		// ends with a newline so that "Example:" starts on its own line.
+		std::cout << "The output database will contains only k-mers that are present in first input set and have counters higher than apropriate k-mers in second set. For each k-mer the counter is equal to difference between counter in first set and counter in second set.\n";
+		std::cout << "Example:\n";
+		std::cout << "kmc -k28 file1.fastq kmers1 tmp\n";
+		std::cout << "kmc -k28 file2.fastq kmers2 tmp\n";
+		std::cout << "kmc_tools counters_subtract kmers1 kmers2 kmers1_kmers2_counters_subtract\n";
+	}
+};
+
+// Help text for the 'kmers_subtract' operation.
+class CKmersSubtractUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CKmersSubtractUsageDisplayer() :CUsageDisplayer("kmers_subtract")
+	{}
+	void Display() const override
+	{
+		Display2ArgGeneral();
+		// The description ends with a newline so that "Example:" starts on its own line.
+		std::cout << "The output database will contains only k-mers that are present in first input set but absent in the second one. The counter value is equal to value from first input set.\n";
+		std::cout << "Example:\n";
+		// Options are written without a space after the dash (-k28, -cs200)
+		std::cout << "kmc -k28 file1.fastq kmers1 tmp\n";
+		std::cout << "kmc -k28 file2.fastq kmers2 tmp\n";
+		std::cout << "kmc_tools kmers_subtract kmers1 kmers2 kmers1_kmers2_subtract -cs200\n";
+	}
+};
+
+// Help text for the 'complex' operation: describes the operations definition file
+// and prints an example of it.
+// NOTE(review): the printed text contains spelling slips ("equasion", "barenthesis");
+// they are runtime strings and are left untouched here.
+class CComplexUsageDisplayer : public CUsageDisplayer
+{
+public:
+ CComplexUsageDisplayer() :CUsageDisplayer("complex")
+ {}
+ void Display() const override
+ {
+ // Command-line syntax and the layout of the definition file
+ std::cout << "Complex operation allows to define operations for more than 2 input k-mers sets. Command-line syntax:\n";
+ std::cout << "kmc_tools complex <operations_definition_file>\n";
+ std::cout << " operations_definition_file - path to file which define input sets and operations. It is text file with following syntax:\n";
+ std::cout << " __________________________________________________________________ \n";
+ std::cout << "|INPUT: |\n";
+ std::cout << "|<input1>=<input1_db_path> [params] |\n";
+ std::cout << "|<input2>=<input2_db_path> [params] |\n";
+ std::cout << "|... |\n";
+ std::cout << "|<inputN>=<inputN_db_path> [params] |\n";
+ std::cout << "|OUTPUT: |\n";
+ std::cout << "|<out_db_path>=<ref_input><oper><ref_input>[<oper><ref_input>[...] |\n";
+ std::cout << "|[OUTPUT_PARAMS: __|\n";
+ std::cout << "|<output_params>] | /\n";
+ std::cout << "| | / \n";
+ std::cout << "|_______________________________________________________________|/ \n";
+ // Explanation of the placeholders used above
+ std::cout << "input1, input2, ..., inputN - names of inputs used to define equasion\n";
+ std::cout << "input1_db_path, input2_db_path, ..., inputN_db_path - paths to k-mers sets\n";
+ std::cout << "For each input there are additional parameters which can be set:\n";
+ std::cout << " -ci<value> - exclude k-mers occurring less than <value> times \n";
+ std::cout << " -cx<value> - exclude k-mers occurring more of than <value> times\n";
+ std::cout << "out_db_path - path to output database\n";
+ std::cout << "ref_input - one of input1, input2, ..., inputN\n";
+ std::cout << "oper - one of {*,-,~,+}, which refers to {intersect, kmers_subtract, counters_subtract, union}\n";
+ std::cout << "operator * has the highest priority. Other operators has equals priorities. Order of operations can be changed with barenthesis\n";
+ std::cout << "output_params are:\n";
+ std::cout << " -ci<value> - exclude k-mers occurring less than <value> times \n";
+ std::cout << " -cx<value> - exclude k-mers occurring more of than <value> times\n";
+ std::cout << " -cs<value> - maximal value of a counter\n";
+
+ // Sample definition file
+ std::cout << "Example:\n";
+ std::cout << " __________________________________________________________________ \n";
+ std::cout << "|INPUT: |\n";
+ std::cout << "|set1 = kmc_o1 -ci5 |\n";
+ std::cout << "|set2 = kmc_o2 |\n";
+ std::cout << "|set3 = kmc_o3 -ci10 -cx100 __|\n";
+ std::cout << "|OUTPUT: | /\n";
+ std::cout << "|result = (set3+set1)*set2 | / \n";
+ std::cout << "|_______________________________________________________________|/ \n";
+
+ }
+};
+
+// Help text for the 'sort' operation.
+class CSortUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CSortUsageDisplayer() : CUsageDisplayer("sort")
+	{
+	}
+
+	void Display() const override
+	{
+		Display1ArgGeneral(true);
+		std::cout << " For output there are additional parameters:\n"
+			<< " -cs<value> - maximal value of a counter\n"
+			<< "Converts database produced by KMC2.x to KMC1.x database format (which contains k-mers in sorted order)\n"
+			<< "Example:\n"
+			<< "kmc_tools sort wy_kmc2 -ci3 -cx1000 wy_kmc1 -cs255\n";
+	}
+};
+
+// Help text for the 'reduce' operation.
+class CReduceUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CReduceUsageDisplayer() : CUsageDisplayer("reduce")
+	{
+	}
+
+	void Display() const override
+	{
+		Display1ArgGeneral(true);
+		std::cout << " For output there are additional parameters:\n"
+			<< " -cs<value> - maximal value of a counter\n"
+			<< "Exclude too rare and too frequent k-mers\n"
+			<< "Example:\n"
+			<< "kmc_tools reduce wy_kmc2 -ci3 -cx1000 wy_kmc1 -cs255\n";
+	}
+};
+
+
+// Help text for the 'compact' operation.
+class CCompactUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CCompactUsageDisplayer() : CUsageDisplayer("compact")
+	{
+	}
+
+	void Display() const override
+	{
+		Display1ArgGeneral(false);
+		std::cout << "Remove counters of k-mers\n"
+			<< "Example:\n"
+			<< "kmc_tools compact wy_kmc2 -ci3 -cx1000 wy_kmc1\n";
+	}
+};
+
+// Help text for the 'histogram' operation.
+class CHistogramUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CHistogramUsageDisplayer() : CUsageDisplayer("histogram")
+	{
+	}
+
+	void Display() const override
+	{
+		Display1ArgGeneral(false);
+		std::cout << "Produce histogram of k-mers occurrences\n"
+			<< "Example:\n"
+			<< "kmc_tools histogram wy_kmc2 -ci3 -cx1000 histo.txt\n";
+	}
+};
+
+
+// Help text for the 'dump' operation. It prints its own syntax block instead of
+// calling Display1ArgGeneral, because dump accepts parameters before <input>.
+class CDumpUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CDumpUsageDisplayer() : CUsageDisplayer("dump")
+	{
+	}
+
+	void Display() const override
+	{
+		std::cout << " The '" << name << "' is one argument operation. General syntax:\n"
+			<< " kmc_tools " << name << " [dump_params] <input> [input_params] <output>\n"
+			<< " dump_params:\n"
+			<< " -s - sorted output\n"
+			<< " input - path to database generated by KMC \n"
+			<< " For input there are additional parameters:\n"
+			<< " -ci<value> - exclude k-mers occurring less than <value> times \n"
+			<< " -cx<value> - exclude k-mers occurring more of than <value> times\n";
+
+		std::cout << "Produce text dump of kmc database\n"
+			<< "Example:\n"
+			<< "kmc_tools dump wy_kmc2 -ci3 -cx1000 dump.txt\n";
+	}
+};
+
+// Help text for the 'filter' operation.
+class CFilterUsageDisplayer : public CUsageDisplayer
+{
+public:
+	CFilterUsageDisplayer() : CUsageDisplayer("filter")
+	{}
+	void Display() const override
+	{
+		std::cout << " The '" << name << "' is two arguments' operation. General syntax:\n";
+		std::cout << " kmc_tools " << name << " <kmc_input_db> [kmc_input_db_params] <input_read_set> [input_read_set_params] <output_read_set> [output_read_set_params]\n";
+		std::cout << " kmc_input_db - path to database generated by KMC \n";
+		std::cout << " input_read_set - path to input set of reads \n";
+		std::cout << " output_read_set - path to set output of reads \n";
+		std::cout << " For k-mers' database there are additional parameters:\n";
+		std::cout << " -ci<value> - exclude k-mers occurring less than <value> times \n";
+		std::cout << " -cx<value> - exclude k-mers occurring more of than <value> times\n";
+		std::cout << " For input set of reads there are additional parameters:\n";
+		std::cout << " -ci<value> - remove reads containing less k-mers than value. It can be integer or floating number in range [0.0;1.0]\n";
+		// The upper bound is -cx (as in the example below), not a second -ci
+		std::cout << " -cx<value> - remove reads containing more k-mers than value. It can be integer or floating number in range [0.0;1.0]\n";
+		std::cout << " -f<a/q> - input in FASTA format (-fa), FASTQ format (-fq); default: FASTQ\n";
+		std::cout << " For output set of reads there are additional parameters:\n";
+		std::cout << " -f<a/q> - output in FASTA format (-fa), FASTQ format (-fq); default: same as input\n";
+		std::cout << "Example:\n";
+		std::cout << "kmc_tools filter kmc_db -ci3 input.fastq -ci0.5 -cx1.0 filtered.fastq\n";
+		std::cout << "kmc_tools filter kmc_db input.fastq -ci10 -cx100 filtered.fastq\n";
+	}
+};
+
+// Creates and owns the help printer matching an operation mode.
+class CUsageDisplayerFactory
+{
+	std::unique_ptr<CUsageDisplayer> desc;
+
+	// Maps a mode to its displayer; modes without a dedicated displayer
+	// (UNDEFINED, COMPARE, anything unknown) get the general help.
+	static std::unique_ptr<CUsageDisplayer> Create(CConfig::Mode mode)
+	{
+		using Mode = CConfig::Mode;
+		switch (mode)
+		{
+		case Mode::INTERSECTION:
+			return std::make_unique<CIntersectUsageDisplayer>();
+		case Mode::KMERS_SUBTRACT:
+			return std::make_unique<CKmersSubtractUsageDisplayer>();
+		case Mode::COUNTERS_SUBTRACT:
+			return std::make_unique<CCountersSubtractUsageDisplayer>();
+		case Mode::UNION:
+			return std::make_unique<CUnionUsageDisplayer>();
+		case Mode::COMPLEX:
+			return std::make_unique<CComplexUsageDisplayer>();
+		case Mode::SORT:
+			return std::make_unique<CSortUsageDisplayer>();
+		case Mode::REDUCE:
+			return std::make_unique<CReduceUsageDisplayer>();
+		case Mode::COMPACT:
+			return std::make_unique<CCompactUsageDisplayer>();
+		case Mode::HISTOGRAM:
+			return std::make_unique<CHistogramUsageDisplayer>();
+		case Mode::DUMP:
+			return std::make_unique<CDumpUsageDisplayer>();
+		case Mode::FILTER:
+			return std::make_unique<CFilterUsageDisplayer>();
+		case Mode::UNDEFINED:
+		case Mode::COMPARE:
+		default:
+			return std::make_unique<CGeneralUsageDisplayer>();
+		}
+	}
+
+public:
+	CUsageDisplayerFactory(CConfig::Mode mode) : desc(Create(mode))
+	{
+	}
+
+	// The returned reference stays valid as long as this factory object lives.
+	const CUsageDisplayer& GetUsageDisplayer()
+	{
+		return *desc;
+	}
+};
+
+#endif
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/defs.h b/kmc_tools/defs.h
new file mode 100644
index 0000000..122bd97
--- /dev/null
+++ b/kmc_tools/defs.h
@@ -0,0 +1,92 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _DEFS_H
+#define _DEFS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Short aliases for the integer types used throughout kmc_tools.
+using uint32 = unsigned int;
+using uint64 = unsigned long long;
+using int32 = int;
+using int64 = long long;
+using uchar = unsigned char;
+
+// Function-like macros: arguments may be evaluated more than once.
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+// Clamps x to the range [lower, upper].
+#define NORM(x, lower, upper) ((x) < (lower) ? (lower) : (x) > (upper) ? (upper) : (x))
+
+// Yields 1, 2, 3 or 4 depending on whether x is below 2^8, 2^16, 2^24 or not.
+#define BYTE_LOG(x) (((x) < (1 << 8)) ? 1 : ((x) < (1 << 16)) ? 2 : ((x) < (1 << 24)) ? 3 : 4)
+
+
+//#define DISABLE_ASMLIB
+
+//#define ENABLE_DEBUG
+//#define ENABLE_LOGGER
+
+// Version string and release date printed by the general usage text.
+#define KMC_VER "2.3.0"
+#define KMC_DATE "2015-08-21"
+
+
+
+
+#define DEFAULT_CIRCULAL_QUEUE_CAPACITY (4)
+
+#define SUFIX_WRITE_QUEUE_CAPACITY (10)
+
+
+// Buffer sizes in bytes (1 << 24 = 16 MiB each) for the database readers and the writer.
+#define KMC1_DB_READER_PREFIX_BUFF_BYTES (1 << 24)
+#define KMC1_DB_READER_SUFIX_BUFF_BYTES (1 << 24)
+
+#define KMC2_DB_READER_PREFIX_BUFF_BYTES (1 << 24)
+#define KMC2_DB_READER_SUFIX_BUFF_BYTES (1 << 24)
+
+#define KMC1_DB_WRITER_PREFIX_BUFF_BYTES (1 << 24)
+#define KMC1_DB_WRITER_SUFIX_BUFF_BYTES (1 << 24)
+
+#define HISTOGRAM_MAX_COUNTER_DEFAULT 10000
+
+#define DUMP_BUF_SIZE (1 << 24)
+
+//Increasing this value leads to higher memory consumption, but preliminary observations show no impact on running time, so it is recommended not to change this value
+#define BUNDLE_CAPACITY (1 << 12) //in kmers, for kmers and counters.
+
+//this value has a high impact on memory usage; the maximum memory used is 2 * SINGLE_BIN_BUFF_SIZE_FOR_DB2_READER * number_of_kmc2_input_dbs * number_of_bins_per_in_db
+//increasing this value can have a positive performance impact when running on HDD
+#define SINGLE_BIN_BUFF_SIZE_FOR_DB2_READER (1 << 21) //if less is needed less will be allocated
+
+
+//default values
+#define CUTOFF_MIN 2
+#define CUTOFF_MAX 1000000000
+#define COUNTER_MAX 255
+
+// Largest supported k and the number of 64-bit words needed for it (32 symbols per word).
+#define MAX_K 256
+#define KMER_WORDS ((MAX_K + 31) / 32)
+
+
+#define USE_META_PROG
+
+// File helpers: 64-bit seek/tell variants on Windows, plain fseek/ftell elsewhere.
+// NOTE(review): fseek/ftell use long offsets, which may be 32-bit on some platforms - confirm large files are handled.
+#ifdef WIN32
+#define my_fopen fopen
+#define my_fseek _fseeki64
+#define my_ftell _ftelli64
+
+#else
+#define my_fopen fopen
+#define my_fseek fseek
+#define my_ftell ftell
+#endif
+
+#endif
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/dump_writer.h b/kmc_tools/dump_writer.h
new file mode 100644
index 0000000..3c05b85
--- /dev/null
+++ b/kmc_tools/dump_writer.h
@@ -0,0 +1,174 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _DUMP_WRITER_H
+#define _DUMP_WRITER_H
+#include "defs.h"
+#include "kmer.h"
+#include "nc_utils.h"
+#include "config.h"
+#include <fstream>
+
+
+//wrapper to simplify interface
+
+//For kmc1 input and kmc2 input without -s parameter
+template<typename KMCDB, unsigned SIZE, bool SORTED>
+class CKMCDBForDump
+{
+	// Database reader that supplies the k-mers.
+	KMCDB kmcdb;
+public:
+	// Opens the first configured input (header, description and progress flag are taken from
+	// the CConfig instance) in sequential mode.
+	CKMCDBForDump() :
+		kmcdb(CConfig::GetInstance().headers.front(),
+			CConfig::GetInstance().input_desc.front(),
+			CConfig::GetInstance().percent_progress,
+			KMCDBOpenMode::sequential)
+	{
+	}
+
+	// Fetches the next k-mer and its counter by forwarding to the reader's NextKmerSequential;
+	// the reader's result is returned unchanged.
+	bool NextKmer(CKmer<SIZE>& kmer, uint32& counter)
+	{
+		const bool fetched = kmcdb.NextKmerSequential(kmer, counter);
+		return fetched;
+	}
+};
+
+
+//specialization for the -s parameter and kmc2 input
+template<unsigned SIZE>
+class CKMCDBForDump<CKMC2DbReader<SIZE>, SIZE, true>
+{
+	// Reader opened in sorted mode; it is handed to the bundle below.
+	// NOTE(review): no delete of kmcdb is visible here - presumably CBundle takes ownership; confirm.
+	CKMC2DbReader<SIZE>* kmcdb;
+	// Bundle through which the k-mers of kmcdb are consumed.
+	CBundle<SIZE> bundle;
+public:
+	// Opens the first configured input in sorted mode and attaches the bundle to it.
+	CKMCDBForDump() :
+		kmcdb(new CKMC2DbReader<SIZE>(CConfig::GetInstance().headers.front(),
+			CConfig::GetInstance().input_desc.front(),
+			CConfig::GetInstance().percent_progress,
+			KMCDBOpenMode::sorted)),
+		bundle(kmcdb)
+	{
+	}
+
+	// Copies the bundle's top k-mer and counter into the output arguments and pops it.
+	// Returns false (leaving the arguments untouched) once the bundle is finished.
+	bool NextKmer(CKmer<SIZE>& kmer, uint32& counter)
+	{
+		if (bundle.Finished())
+			return false;
+
+		kmer = bundle.TopKmer();
+		counter = bundle.TopCounter();
+		bundle.Pop();
+		return true;
+	}
+};
+
+// Writes the k-mers delivered by a KMCDB source to the text file named in the configured
+// output description, one "<k-mer>\t<counter>\n" line per k-mer whose counter lies within
+// [cutoff_min, cutoff_max]. KMCDB must provide NextKmer(CKmer<SIZE>&, uint32&).
+template<typename KMCDB, unsigned SIZE>
+class CDumpWriter
+{
+	// Slack kept free at the end of the buffer: the buffer is flushed as soon as fewer than
+	// OVERHEAD_SIZE bytes remain (assumes a single record is shorter than that).
+	static const uint32 OVERHEAD_SIZE = 1000;
+	KMCDB& kmcdb;
+	COutputDesc output_desc;
+	uint32 kmer_len;
+	uint32 kmer_bytes;		// number of bytes holding the packed k-mer: (kmer_len + 3) / 4
+	CConfig& config;
+	uint32 in_first_byte;	// number of symbols stored in the most significant k-mer byte (1..4)
+	char* buf;
+	uint32 buf_size;
+	uint32 buf_pos;
+
+	// Lookup table translating one packed byte (four 2-bit symbols) into four ACGT characters.
+	// The table is a plain array member, so copying the object is safe and no manual
+	// allocation or deallocation is needed.
+	struct DumpOpt
+	{
+		char opt_ACGT[1024];
+		DumpOpt()
+		{
+			const char codes[] = { 'A', 'C', 'G', 'T' };
+			uint32 pos = 0;
+			for (uint32 kmer = 0; kmer < 256; ++kmer)
+			{
+				opt_ACGT[pos++] = codes[(kmer >> 6) & 3];
+				opt_ACGT[pos++] = codes[(kmer >> 4) & 3];
+				opt_ACGT[pos++] = codes[(kmer >> 2) & 3];
+				opt_ACGT[pos++] = codes[kmer & 3];
+			}
+		}
+	}opt;
+
+	// Decodes kmer into kmer_len characters written at kmer_str (no terminator is appended).
+	void kmerToStr(CKmer<SIZE>& kmer, char* kmer_str)
+	{
+		//first byte: only its last in_first_byte symbols belong to the k-mer
+		const char* base = opt.opt_ACGT + 4 * kmer.get_byte(kmer_bytes - 1) + 4 - in_first_byte;
+		for (uint32 i = 0; i < in_first_byte; ++i)
+			*kmer_str++ = *base++;
+		//rest: full bytes, from the most significant to the least significant
+		for (int pos = static_cast<int>(kmer_bytes) - 2; pos >= 0; --pos)
+		{
+			base = opt.opt_ACGT + 4 * kmer.get_byte(pos);
+			*kmer_str++ = *base++;
+			*kmer_str++ = *base++;
+			*kmer_str++ = *base++;
+			*kmer_str++ = *base++;
+		}
+	}
+
+	// Reports a failed write to the output file and terminates, mirroring the handling of a
+	// failed fopen in Process().
+	void writeError()
+	{
+		std::cout << "Error: cannot write to file: " << output_desc.file_src << "\n";
+		exit(1);
+	}
+
+	// Writes the buffered bytes (if any) to file and empties the buffer.
+	void flushBuffer(FILE* file)
+	{
+		if (buf_pos && fwrite(buf, 1, buf_pos, file) != buf_pos)
+			writeError();
+		buf_pos = 0;
+	}
+
+public:
+	// Binds the writer to the k-mer source and caches the k-mer length of the first input header.
+	CDumpWriter(KMCDB& kmcdb) :kmcdb(kmcdb), output_desc(CConfig::GetInstance().output_desc), config(CConfig::GetInstance()),
+		buf(nullptr), buf_size(0), buf_pos(0)
+	{
+		kmer_len = config.headers.front().kmer_len;
+		kmer_bytes = (kmer_len + 3) / 4;
+		in_first_byte = kmer_len % 4;
+		if (in_first_byte == 0)
+			in_first_byte = 4;
+	}
+
+	// Dumps all k-mers of the source whose counter is within the configured cutoffs.
+	// Terminates the program with an error message when the output file cannot be opened or
+	// written; otherwise returns true.
+	bool Process()
+	{
+		CKmer<SIZE> kmer;
+		uint32 counter;
+
+		FILE* file = fopen(output_desc.file_src.c_str(), "wb");
+		if (!file)
+		{
+			std::cout << "Error: cannot open file: " << output_desc.file_src << "\n";
+			exit(1);
+		}
+		buf_pos = 0;
+		buf_size = DUMP_BUF_SIZE;
+		buf = new char[buf_size];
+
+		while (kmcdb.NextKmer(kmer, counter))
+		{
+			if (counter < output_desc.cutoff_min || counter > output_desc.cutoff_max)
+				continue;
+
+			kmerToStr(kmer, buf + buf_pos);
+			buf[buf_pos + kmer_len] = '\t';
+			// Int2PChar's return value is used as the number of characters it wrote
+			const uint32 counter_len = CNumericConversions::Int2PChar(counter, (uchar*)(buf + buf_pos + kmer_len + 1));
+			buf[buf_pos + kmer_len + 1 + counter_len] = '\n';
+			buf_pos += kmer_len + 2 + counter_len;
+			if (buf_pos + OVERHEAD_SIZE > buf_size)
+				flushBuffer(file);
+		}
+
+		//save rest if necessary
+		flushBuffer(file);
+
+		// fclose flushes the stdio buffer, so a failure here also means lost output
+		if (fclose(file) != 0)
+			writeError();
+
+		delete[] buf;
+		buf = nullptr;
+
+		return true;
+	}
+};
+
+#endif
\ No newline at end of file
diff --git a/kmc_tools/expression_node.h b/kmc_tools/expression_node.h
new file mode 100644
index 0000000..91e06a9
--- /dev/null
+++ b/kmc_tools/expression_node.h
@@ -0,0 +1,198 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _EXPRESSION_NODE_H
+#define _EXPRESSION_NODE_H
+#include "defs.h"
+#include "operations.h"
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <memory>
+#include "kmc1_db_reader.h"
+#include "kmc2_db_reader.h"
+
+//************************************************************************************************************
+// CExpressionNode - abstract base class representing an expression node. In the first stage of the
+// algorithm a binary tree is created from the user input. The node type represents the operation. This
+// tree is used only for generating another tree (check out CInput and CBundle)
+//************************************************************************************************************
+template<unsigned SIZE> class CExpressionNode
+{
+public:
+	// Creates a node without children.
+	CExpressionNode() :left(nullptr), right(nullptr)
+	{
+
+	}
+
+	// The node deletes both children in its destructor, so a shallow copy would lead to a
+	// double delete; copying is therefore disabled.
+	CExpressionNode(const CExpressionNode&) = delete;
+	CExpressionNode& operator=(const CExpressionNode&) = delete;
+
+	// Returns the left child (nullptr when none was added).
+	CExpressionNode* GetLeftChild() const
+	{
+		return left;
+	}
+	// Returns the right child (nullptr when none was added).
+	CExpressionNode* GetRightChild() const
+	{
+		return right;
+	}
+
+	// Builds the execution tree corresponding to this node and returns its root.
+	virtual CBundle<SIZE>* GetExecutionRoot() = 0;
+
+	// Stores child as the left child; the node deletes it in its destructor.
+	// With ENABLE_DEBUG defined, adding a second left child terminates the program.
+	void AddLeftChild(CExpressionNode* child)
+	{
+#ifdef ENABLE_DEBUG
+		if (left)
+		{
+			std::cout << "This child node already exists\n";
+			exit(1);
+		}
+#endif
+		left = child;
+	}
+
+	// Stores child as the right child; the node deletes it in its destructor.
+	// With ENABLE_DEBUG defined, adding a second right child terminates the program.
+	void AddRightChild(CExpressionNode* child)
+	{
+#ifdef ENABLE_DEBUG
+		if (right)
+		{
+			std::cout << "This child node already exists\n";
+			exit(1);
+		}
+#endif
+		right = child;
+	}
+
+#ifdef ENABLE_DEBUG
+	// Prints a short description of this node.
+	virtual void Info() = 0;
+	// Prints the subtree rooted at this node: right subtree first, then this node indented by
+	// adient spaces, then the left subtree; each level adds 5 spaces of indentation.
+	void Display(int adient = 0)
+	{
+		if (right)
+			right->Display(adient + 5);
+
+		for (int i = 0; i < adient; ++i)
+			std::cout << " ";
+		Info();
+		std::cout << "\n";
+		if (left)
+			left->Display(adient + 5);
+	}
+#endif
+
+	// Deletes both children (and, recursively, their subtrees).
+	virtual ~CExpressionNode()
+	{
+		delete left;
+		delete right;
+	}
+
+protected:
+	CExpressionNode* left, *right;
+};
+
+//************************************************************************************************************
+// CUnionNode - represents node for union operation
+//************************************************************************************************************
+template<unsigned SIZE> class CUnionNode : public CExpressionNode<SIZE>
+{
+public:
+	// Creates a CUnion over the execution roots of the left and right children and returns it
+	// wrapped in a newly allocated CBundle.
+	CBundle<SIZE>* GetExecutionRoot() override
+	{
+		CUnion<SIZE>* operation = new CUnion<SIZE>(this->left->GetExecutionRoot(), this->right->GetExecutionRoot());
+		return new CBundle<SIZE>(operation);
+	}
+#ifdef ENABLE_DEBUG
+	// Prints the symbol used for this operation.
+	void Info() override
+	{
+		std::cout << "+";
+	}
+#endif
+};
+
+//************************************************************************************************************
+// CKmersSubtractionNode - represents node for subtraction of k-mers (if k-mer exists in both input,
+// it is absent in result) operation
+//************************************************************************************************************
+template<unsigned SIZE> class CKmersSubtractionNode : public CExpressionNode<SIZE>
+{
+public:
+	// Creates a CKmersSubtract over the execution roots of the left and right children and
+	// returns it wrapped in a newly allocated CBundle.
+	CBundle<SIZE>* GetExecutionRoot() override
+	{
+		CKmersSubtract<SIZE>* operation = new CKmersSubtract<SIZE>(this->left->GetExecutionRoot(), this->right->GetExecutionRoot());
+		return new CBundle<SIZE>(operation);
+	}
+#ifdef ENABLE_DEBUG
+	// Prints the symbol used for this operation.
+	void Info() override
+	{
+		std::cout << "-";
+	}
+#endif
+};
+
+
+template<unsigned SIZE> class CCountersSubtractionNode : public CExpressionNode<SIZE>
+{
+public:
+	// Creates a CCountersSubtract over the execution roots of the left and right children and
+	// returns it wrapped in a newly allocated CBundle.
+	CBundle<SIZE>* GetExecutionRoot() override
+	{
+		CCountersSubtract<SIZE>* operation = new CCountersSubtract<SIZE>(this->left->GetExecutionRoot(), this->right->GetExecutionRoot());
+		return new CBundle<SIZE>(operation);
+	}
+#ifdef ENABLE_DEBUG
+	// Prints the symbol used for this operation.
+	void Info() override
+	{
+		std::cout << "~";
+	}
+#endif
+};
+//************************************************************************************************************
+// CIntersectionNode - represents node for intersection operation
+//************************************************************************************************************
+template<unsigned SIZE> class CIntersectionNode : public CExpressionNode<SIZE>
+{
+public:
+	// Creates a CIntersection over the execution roots of the left and right children and
+	// returns it wrapped in a newly allocated CBundle.
+	CBundle<SIZE>* GetExecutionRoot() override
+	{
+		CIntersection<SIZE>* operation = new CIntersection<SIZE>(this->left->GetExecutionRoot(), this->right->GetExecutionRoot());
+		return new CBundle<SIZE>(operation);
+	}
+#ifdef ENABLE_DEBUG
+	// Prints the symbol used for this operation.
+	void Info() override
+	{
+		std::cout << "*";
+	}
+#endif
+};
+
+//************************************************************************************************************
+// CInputNode - represents node (leaf) - KMC1 or KMC2 database
+//************************************************************************************************************
+template<unsigned SIZE> class CInputNode : public CExpressionNode<SIZE>
+{
+	// Index of this input in the configuration's headers / input_desc collections.
+	uint32 desc_pos;
+public:
+	CInputNode(uint32 desc_pos) : desc_pos(desc_pos)
+	{
+	}
+
+	// Opens the database described at desc_pos in sorted mode - with a KMC2 reader when its
+	// header reports KMC2, with a KMC1 reader otherwise - and returns it wrapped in a newly
+	// allocated CBundle.
+	CBundle<SIZE>* GetExecutionRoot() override
+	{
+		CConfig& cfg = CConfig::GetInstance();
+		CInput<SIZE>* reader = nullptr;
+		if (cfg.headers[desc_pos].IsKMC2())
+			reader = new CKMC2DbReader<SIZE>(cfg.headers[desc_pos], cfg.input_desc[desc_pos], cfg.percent_progress, KMCDBOpenMode::sorted);
+		else
+			reader = new CKMC1DbReader<SIZE>(cfg.headers[desc_pos], cfg.input_desc[desc_pos], cfg.percent_progress, KMCDBOpenMode::sorted);
+		return new CBundle<SIZE>(reader);
+	}
+
+#ifdef ENABLE_DEBUG
+	// Prints the source file of this input.
+	void Info() override
+	{
+		std::cout << "In: " << CConfig::GetInstance().input_desc[desc_pos].file_src;
+	}
+#endif
+};
+
+
+#endif
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/fastq_filter.cpp b/kmc_tools/fastq_filter.cpp
new file mode 100644
index 0000000..0e5fad9
--- /dev/null
+++ b/kmc_tools/fastq_filter.cpp
@@ -0,0 +1,364 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include "fastq_filter.h"
+#include "asmlib_wrapper.h"
+#include <numeric>
+
+using namespace std;
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Copies the queues, memory pools and filtering thresholds this filter works with.
+// The part buffers, positions and seq_desc are not set here; the Process* methods set them.
+// NOTE(review): output_part_size is taken from mem_part_pmm_fastq_reader although output
+// parts are reserved from pmm_fastq_filter - confirm both pools use the same part size.
+CFastqFilter::CFastqFilter(CFilteringParams& Params, CFilteringQueues& Queues, CKMCFile& kmc_api) :
+	input_part_queue(Queues.input_part_queue),
+	filtered_part_queue(Queues.filtered_part_queue),
+	pmm_fastq_reader(Queues.pmm_fastq_reader),
+	pmm_fastq_filter(Queues.pmm_fastq_filter),
+	input_file_type(Params.input_file_type),
+	output_file_type(Params.output_file_type),
+	kmc_api(kmc_api),
+	output_part_size(Params.mem_part_pmm_fastq_reader),
+	use_float_value(Params.use_float_value),
+	f_max_kmers(Params.f_max_kmers),
+	f_min_kmers(Params.f_min_kmers),
+	n_max_kmers(Params.n_max_kmers),
+	n_min_kmers(Params.n_min_kmers),
+	kmer_len(Params.kmer_len)
+{
+}
+
+/*****************************************************************************************************************************/
+// Creates the wrapped CFastqFilter from the given parameters, queues and k-mer database.
+CWFastqFilter::CWFastqFilter(CFilteringParams& Params, CFilteringQueues& Queues, CKMCFile& kmc_api) :
+	ff(make_unique<CFastqFilter>(Params, Queues, kmc_api))
+{
+}
+
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Runs the filtering routine matching the configured input/output formats.
+// Supported combinations: FASTQ->FASTQ, FASTQ->FASTA and FASTA->FASTA; any other combination
+// prints an error and terminates the program.
+void CFastqFilter::Process()
+{
+	using FileType = CFilteringParams::file_type;
+
+	const bool fastq_in = (input_file_type == FileType::fastq);
+	const bool fasta_in = (input_file_type == FileType::fasta);
+	const bool fastq_out = (output_file_type == FileType::fastq);
+	const bool fasta_out = (output_file_type == FileType::fasta);
+
+	if (fastq_in && fastq_out)
+	{
+		ProcessFastqToFastq();
+		return;
+	}
+	if (fastq_in && fasta_out)
+	{
+		ProcessFastqToFasta();
+		return;
+	}
+	if (fasta_in && fasta_out)
+	{
+		ProcessFastaToFasta();
+		return;
+	}
+
+	cout << "Error: this file type is not supported by filter operation\n";
+	exit(1);
+}
+
+/*****************************************************************************************************************************/
+// Thread entry point: runs the wrapped filter.
+void CWFastqFilter::operator()()
+{
+	CFastqFilter& filter = *ff;
+	filter.Process();
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Decides whether the read described by seq_desc passes the filter.
+// The read is looked up in the k-mer database and the k-mers with a non-zero counter are
+// counted. The read passes when that count lies within [n_min_kmers, n_max_kmers] or, when
+// use_float_value is set, within [f_min_kmers, f_max_kmers] scaled by the number of k-mers
+// in the read (truncated to an integer).
+// Returns true when the read passes.
+bool CFastqFilter::FilterRead()
+{
+	const uint32 read_len = static_cast<uint32>(seq_desc.read_end - seq_desc.read_start);
+	read.assign((char*)input_part + seq_desc.read_start, read_len);
+
+	// NOTE(review): the result of GetCountersForRead is not checked - confirm that counters is
+	// always left in a meaningful state.
+	kmc_api.GetCountersForRead(read, counters);
+	uint32 valid_kmers = 0;
+	for (auto counter : counters)
+		if (counter)
+			++valid_kmers;
+
+	if (!use_float_value)
+		return valid_kmers >= n_min_kmers && valid_kmers <= n_max_kmers;
+
+	// A read shorter than kmer_len contains no k-mers. Without this guard the unsigned
+	// subtraction wraps around to a huge value and the float -> uint32 conversion below is
+	// out of range (undefined behaviour). With the guard both limits become 0 for such reads.
+	const uint32 kmers_in_read = read_len >= kmer_len ? read_len - kmer_len + 1 : 0;
+	const uint32 min_valid = static_cast<uint32>(f_min_kmers * kmers_in_read);
+	const uint32 max_valid = static_cast<uint32>(f_max_kmers * kmers_in_read);
+	return valid_kmers >= min_valid && valid_kmers <= max_valid;
+}
+
+/*****************************************************************************************************************************/
+// Parses the next FASTA record of input_part starting at input_part_pos and stores the offsets
+// of its header line and sequence line in seq_desc (read_header_*, read_*, end).
+// Any byte below 32 is treated as a line terminator; a line may end with one or two such bytes.
+// Returns false when no further record can be parsed (end of part, missing '>' or a record
+// without a sequence line), true otherwise.
+bool CFastqFilter::NextSeqFasta()
+{
+	// Unsigned on purpose: with a plain (possibly signed) char, bytes >= 0x80 would compare
+	// below 32 and be mistaken for line terminators.
+	uchar c;
+
+	// Title
+	if (input_part_pos >= input_part_size)
+		return false;
+	c = input_part[input_part_pos++];
+	if (c != '>')
+		return false;
+
+	seq_desc.read_header_start = input_part_pos - 1;
+
+	while (input_part_pos < input_part_size)
+	{
+		c = input_part[input_part_pos++];
+		if (c < 32)					// newliners
+			break;
+	}
+	seq_desc.read_header_end = input_part_pos - 1;
+
+	if (input_part_pos >= input_part_size)
+		return false;
+
+	// Consume the second byte of a two-byte line terminator, if present
+	c = input_part[input_part_pos++];
+	if (c >= 32)
+		input_part_pos--;
+	else if (input_part_pos >= input_part_size)
+		return false;
+
+	seq_desc.read_start = input_part_pos;
+	// Sequence
+	while (input_part_pos < input_part_size)
+	{
+		c = input_part[input_part_pos++];
+		if (c < 32)					// newliners
+			break;
+	}
+	seq_desc.read_end = input_part_pos - 1;
+
+	seq_desc.end = input_part_pos;
+
+	if (input_part_pos >= input_part_size)
+		return true;
+
+	// Include the second byte of a two-byte line terminator in the record, if present
+	seq_desc.end++;
+	if (input_part[input_part_pos++] >= 32)
+	{
+		input_part_pos--;
+		seq_desc.end--;
+	}
+
+	else if (input_part_pos >= input_part_size)
+		return true;
+
+	// c still holds the byte that ended the sequence scan
+	return (c == '\n' || c == '\r');
+}
+
+/*****************************************************************************************************************************/
+// Parses the next FASTQ record of input_part starting at input_part_pos and stores the offsets
+// of its four lines (header, sequence, '+' line, quality) in seq_desc.
+// Any byte below 32 is treated as a line terminator; a line may end with one or two such bytes.
+// The quality line is not scanned: it is skipped using the length of the sequence line.
+// Returns false when no further complete record can be parsed (end of part, missing '@' or
+// '+', truncated record), true otherwise.
+bool CFastqFilter::NextSeqFastq()
+{
+	// Unsigned on purpose: with a plain (possibly signed) char, bytes >= 0x80 would compare
+	// below 32 and be mistaken for line terminators.
+	uchar c;
+	// Title
+	if (input_part_pos >= input_part_size)
+		return false;
+
+
+	c = input_part[input_part_pos++];
+	if (c != '@')
+		return false;
+
+	seq_desc.read_header_start = input_part_pos - 1;
+
+	while (input_part_pos < input_part_size)
+	{
+		c = input_part[input_part_pos++];
+		if (c < 32)					// newliners
+			break;
+	}
+	seq_desc.read_header_end = input_part_pos - 1;
+
+	if (input_part_pos >= input_part_size)
+		return false;
+
+	// Consume the second byte of a two-byte line terminator, if present
+	c = input_part[input_part_pos++];
+	if (c >= 32)
+		input_part_pos--;
+	else if (input_part_pos >= input_part_size)
+		return false;
+
+	seq_desc.read_start = input_part_pos;
+	// Sequence
+	while (input_part_pos < input_part_size)
+	{
+		c = input_part[input_part_pos++];
+		if (c < 32)					// newliners
+			break;
+	}
+	seq_desc.read_end = input_part_pos - 1;
+
+	if (input_part_pos >= input_part_size)
+		return false;
+
+	c = input_part[input_part_pos++];
+	if (c >= 32)
+		input_part_pos--;
+	else if (input_part_pos >= input_part_size)
+		return false;
+
+	// Plus
+	c = input_part[input_part_pos++];
+	if (input_part_pos >= input_part_size)
+		return false;
+	if (c != '+')
+		return false;
+
+	seq_desc.quality_header_start = input_part_pos - 1;
+
+	while (input_part_pos < input_part_size)
+	{
+		c = input_part[input_part_pos++];
+		if (c < 32)					// newliners
+			break;
+	}
+	seq_desc.quality_header_end = input_part_pos - 1;
+
+	if (input_part_pos >= input_part_size)
+		return false;
+
+	c = input_part[input_part_pos++];
+	if (c >= 32)
+		input_part_pos--;
+	else if (input_part_pos >= input_part_size)
+		return false;
+
+	// Quality
+	seq_desc.quality_start = input_part_pos;
+
+	// Jump over as many quality bytes as there are bases in the sequence line
+	input_part_pos += seq_desc.read_end - seq_desc.read_start;
+	if (input_part_pos >= input_part_size)
+		return false;
+	c = input_part[input_part_pos++];
+
+	seq_desc.quality_end = input_part_pos - 1;
+
+	seq_desc.end = input_part_pos;
+
+	if (input_part_pos >= input_part_size)
+		return true;
+
+	// Include the second byte of a two-byte line terminator in the record, if present
+	seq_desc.end++;
+	if (input_part[input_part_pos++] >= 32)
+	{
+		input_part_pos--;
+		seq_desc.end--;
+
+	}
+	else if (input_part_pos >= input_part_size)
+		return true;
+
+	// c still holds the byte read right after the quality string
+	return c == '\n' || c == '\r';
+}
+
+/*****************************************************************************************************************************/
+// Filters FASTA input into FASTA output.
+// Pops input parts until the input queue is exhausted, copies every record that passes
+// FilterRead() unchanged into the current output part and pushes an output part to the
+// filtered queue whenever the next record would not fit. The last output part is pushed at
+// the end and the filtered queue is marked as completed.
+void CFastqFilter::ProcessFastaToFasta()
+{
+	pmm_fastq_filter->reserve(output_part);
+	output_part_pos = 0;
+	while (input_part_queue->pop(input_part, input_part_size))
+	{
+		input_part_pos = 0;
+		while (NextSeqFasta())
+		{
+			if (!FilterRead())
+				continue;
+
+			// whole record: header line and sequence line including line terminators
+			const uint64 record_bytes = seq_desc.end - seq_desc.read_header_start;
+			if (output_part_pos + record_bytes > output_part_size)
+			{
+				filtered_part_queue->push(output_part, output_part_pos);
+				pmm_fastq_filter->reserve(output_part);
+				output_part_pos = 0;
+			}
+			A_memcpy(output_part + output_part_pos, input_part + seq_desc.read_header_start, record_bytes);
+			output_part_pos += record_bytes;
+		}
+		pmm_fastq_reader->free(input_part);
+	}
+	filtered_part_queue->push(output_part, output_part_pos);
+	filtered_part_queue->mark_completed();
+}
+
+/*****************************************************************************************************************************/
+// Filters FASTQ input into FASTQ output.
+// Every record that passes FilterRead() is copied in two pieces: from the header up to and
+// including the '+' character, and from the terminator of the '+' line up to the end of the
+// record - so any text following '+' on the quality header line is not copied.
+// Output parts are pushed to the filtered queue whenever the next record would not fit; the
+// last output part is pushed at the end and the filtered queue is marked as completed.
+void CFastqFilter::ProcessFastqToFastq()
+{
+	pmm_fastq_filter->reserve(output_part);
+	output_part_pos = 0;
+	while (input_part_queue->pop(input_part, input_part_size))
+	{
+		input_part_pos = 0;
+		while (NextSeqFastq())
+		{
+			if (!FilterRead())
+				continue;
+
+			const uint64 head_bytes = seq_desc.quality_header_start - seq_desc.read_header_start + 1;
+			const uint64 tail_bytes = seq_desc.end - seq_desc.quality_header_end;
+			if (output_part_pos + head_bytes + tail_bytes > output_part_size)
+			{
+				filtered_part_queue->push(output_part, output_part_pos);
+				pmm_fastq_filter->reserve(output_part);
+				output_part_pos = 0;
+			}
+			A_memcpy(output_part + output_part_pos, input_part + seq_desc.read_header_start, head_bytes);
+			output_part_pos += head_bytes;
+			A_memcpy(output_part + output_part_pos, input_part + seq_desc.quality_header_end, tail_bytes);
+			output_part_pos += tail_bytes;
+		}
+		pmm_fastq_reader->free(input_part);
+	}
+	filtered_part_queue->push(output_part, output_part_pos);
+	filtered_part_queue->mark_completed();
+}
+
+/*****************************************************************************************************************************/
+// Filters FASTQ input into FASTA output.
+// For every record that passes FilterRead() the leading '@' is overwritten with '>' in the
+// input part and the bytes from the header up to (not including) the '+' line are copied to
+// the output part. Output parts are pushed to the filtered queue whenever the next record
+// would not fit; the last output part is pushed at the end and the filtered queue is marked
+// as completed.
+void CFastqFilter::ProcessFastqToFasta()
+{
+	pmm_fastq_filter->reserve(output_part);
+	output_part_pos = 0;
+	while (input_part_queue->pop(input_part, input_part_size))
+	{
+		input_part_pos = 0;
+		while (NextSeqFastq())
+		{
+			if (!FilterRead())
+				continue;
+
+			// header line and sequence line only
+			const uint64 record_bytes = seq_desc.quality_header_start - seq_desc.read_header_start;
+			if (output_part_pos + record_bytes > output_part_size)
+			{
+				filtered_part_queue->push(output_part, output_part_pos);
+				pmm_fastq_filter->reserve(output_part);
+				output_part_pos = 0;
+			}
+			input_part[seq_desc.read_header_start] = '>';
+			A_memcpy(output_part + output_part_pos, input_part + seq_desc.read_header_start, record_bytes);
+			output_part_pos += record_bytes;
+		}
+		pmm_fastq_reader->free(input_part);
+	}
+	filtered_part_queue->push(output_part, output_part_pos);
+	filtered_part_queue->mark_completed();
+}
+
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/fastq_filter.h b/kmc_tools/fastq_filter.h
new file mode 100644
index 0000000..a6c3332
--- /dev/null
+++ b/kmc_tools/fastq_filter.h
@@ -0,0 +1,83 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _FASTQ_FILTER_H
+#define _FASTQ_FILTER_H
+
+#include "config.h"
+#include "../kmc_api/kmc_file.h"
+
+//************************************************************************************************************
+// CFastqFilter - filter of reads
+//************************************************************************************************************
+class CFastqFilter
+{
+ CPartQueue *input_part_queue, *filtered_part_queue; // queue the input parts are popped from / queue the filtered parts are pushed to
+ CMemoryPool *pmm_fastq_reader; // pool to which consumed input parts are freed
+ CMemoryPool *pmm_fastq_filter; // pool from which output parts are reserved
+ CFilteringParams::file_type input_file_type, output_file_type; // select the Process* routine used by Process()
+ CKMCFile& kmc_api; // k-mer database queried for the counters of each read
+ uint64 output_part_size; // limit checked before a record is appended to output_part
+
+ uchar* input_part; // part currently being parsed
+ uint64 input_part_size; // number of bytes in input_part
+ uint64 input_part_pos; // parse position within input_part
+ uchar* output_part; // part currently being filled with accepted records
+ uint64 output_part_pos; // number of bytes already written to output_part
+
+ std::vector<uint32> counters; // counters of the k-mers of the current read, filled by kmc_api
+ std::string read; // bases of the current read
+ struct { // offsets into input_part describing the record parsed last
+ uint64 read_header_start;
+ uint64 read_header_end;
+ uint64 read_start;
+ uint64 read_end;
+ uint64 quality_header_start;
+ uint64 quality_header_end;
+ uint64 quality_start;
+ uint64 quality_end;
+ uint64 end;
+ }seq_desc;
+
+ bool use_float_value; // true: thresholds are the f_* fractions of the k-mers in a read; false: the absolute n_* counts
+ float f_max_kmers;
+ float f_min_kmers;
+ uint32 n_max_kmers;
+ uint32 n_min_kmers;
+ uint32 kmer_len;
+
+ void ProcessFastaToFasta();
+ void ProcessFastqToFasta();
+ void ProcessFastqToFastq();
+
+ bool NextSeqFastq(); // parses the next FASTQ record into seq_desc
+ bool NextSeqFasta(); // parses the next FASTA record into seq_desc
+ bool FilterRead(); // true when the record in seq_desc passes the thresholds
+public:
+ CFastqFilter(CFilteringParams& Params, CFilteringQueues& Queues, CKMCFile& kmc_api);
+ void Process(); // dispatches on the input/output file types; exits on an unsupported combination
+
+};
+
+//************************************************************************************************************
+// CWFastqFilter - wrapper for CFastqFilter class - for multithreading purposes
+//************************************************************************************************************
+class CWFastqFilter
+{
+ std::unique_ptr<CFastqFilter> ff; // the wrapped filter, created in the constructor
+public:
+ CWFastqFilter(CFilteringParams& Params, CFilteringQueues& Queues, CKMCFile& kmc_api);
+ void operator()(); // calls Process() on the wrapped filter
+};
+
+
+#endif
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmer_counter/fastq_reader.cpp b/kmc_tools/fastq_reader.cpp
similarity index 65%
copy from kmer_counter/fastq_reader.cpp
copy to kmc_tools/fastq_reader.cpp
index 844a113..9fdbea4 100644
--- a/kmer_counter/fastq_reader.cpp
+++ b/kmc_tools/fastq_reader.cpp
@@ -1,15 +1,18 @@
-#include "stdafx.h"
/*
This file is a part of KMC software distributed under GNU GPL 3 licence.
The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
- Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+ Authors: Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
+#include "stdafx.h"
+
#include <algorithm>
+#include <cstring>
+
#include "defs.h"
#include "fastq_reader.h"
@@ -23,9 +26,8 @@ uint64 CFastqReader::OVERHEAD_SIZE = 1 << 16;
// Constructor of FASTA/FASTQ reader
// Parameters:
// * _mm - pointer to memory monitor (to check the memory limits)
-CFastqReader::CFastqReader(CMemoryMonitor *_mm, CMemoryPool *_pmm_fastq, input_type _file_type, uint32 _gzip_buffer_size, uint32 _bzip2_buffer_size, int _kmer_len)
+CFastqReader::CFastqReader(CMemoryPool *_pmm_fastq, CFilteringParams::file_type _file_type, uint32 _gzip_buffer_size, uint32 _bzip2_buffer_size, int _kmer_len)
{
- mm = _mm;
pmm_fastq = _pmm_fastq;
file_type = _file_type;
@@ -46,7 +48,7 @@ CFastqReader::CFastqReader(CMemoryMonitor *_mm, CMemoryPool *_pmm_fastq, input_t
gzip_buffer_size = _gzip_buffer_size;
bzip2_buffer_size = _bzip2_buffer_size;
- containsNextChromosome = false;
+
}
//----------------------------------------------------------------------------------
@@ -149,73 +151,14 @@ bool CFastqReader::OpenFiles()
return true;
}
-//----------------------------------------------------------------------------------
-// Read a part of the file in multi line fasta format
-bool CFastqReader::GetPartFromMultilneFasta(uchar *&_part, uint64 &_size)
-{
- uint64 readed = 0;
-
- if(!containsNextChromosome)
- {
- if(IsEof())
- return false;
- }
- if(mode == m_plain)
- readed = fread(part+part_filled, 1, part_size-part_filled, in);
- else if(mode == m_gzip)
- readed = gzread(in_gzip, part+part_filled, (int) (part_size-part_filled));
- else if(mode == m_bzip2)
- readed = BZ2_bzRead(&bzerror, in_bzip2, part+part_filled, (int) (part_size-part_filled));
- int64 total_filled = part_filled + readed;
- int64 last_header_pos = 0;
- int64 pos = 0;
- for(int64 i = 0 ; i < total_filled ;++i )//find last '>' and remove EOLs
- {
- if(part[i] == '>')
- {
- int64 tmp = i;
- SkipNextEOL(part,i,total_filled);
- copy(part+tmp, part+i, part+pos);
- last_header_pos = pos;
- pos += i - tmp;
- }
- if(part[i] != '\n' && part[i] != '\r')
- {
- part[pos++] = part[i];
- }
- }
-
- _part = part;
- if(last_header_pos == 0)//data in block belong to one seq
- {
- part_filled = kmer_len - 1;
- _size = pos;
- pmm_fastq->reserve(part);
- copy(_part+_size-part_filled, _part+_size, part);
- containsNextChromosome = false;
- }
- else//next seq starts at last_header_pos
- {
- _size = last_header_pos;
- part_filled = pos - last_header_pos;
- pmm_fastq->reserve(part);
- copy(_part + last_header_pos, _part + pos, part);
- containsNextChromosome = true;
- }
- return true;
-}
//----------------------------------------------------------------------------------
// Read a part of the file
bool CFastqReader::GetPart(uchar *&_part, uint64 &_size)
-{
+{
if(!in && !in_gzip && !in_bzip2)
return false;
-
-
- if(file_type == multiline_fasta)
- return GetPartFromMultilneFasta(_part,_size);
if(IsEof())
return false;
@@ -250,7 +193,7 @@ bool CFastqReader::GetPart(uchar *&_part, uint64 &_size)
}
// Look for the end of the last complete record in a buffer
- if(file_type == fasta) // FASTA files
+ if(file_type == CFilteringParams::file_type::fasta) // FASTA files
{
// Looking for a FASTA record at the end of the area
int64 line_start[3];
@@ -363,21 +306,20 @@ bool CFastqReader::IsEof()
//************************************************************************************************************
// CWFastqReader - wrapper for multithreading purposes
//************************************************************************************************************
-CWFastqReader::CWFastqReader(CKMCParams &Params, CKMCQueues &Queues)
-{
- mm = Queues.mm;
- pmm_fastq = Queues.pmm_fastq;
+CWFastqReader::CWFastqReader(CFilteringParams &Params, CFilteringQueues &Queues)
+{
+ pmm_fastq = Queues.pmm_fastq_reader;
input_files_queue = Queues.input_files_queue;
part_size = Params.fastq_buffer_size;
- part_queue = Queues.part_queue;
- file_type = Params.file_type;
- kmer_len = Params.p_k;
+ part_queue = Queues.input_part_queue;
+ file_type = Params.input_file_type;
+ kmer_len = Params.kmer_len;
gzip_buffer_size = Params.gzip_buffer_size;
bzip2_buffer_size = Params.bzip2_buffer_size;
- fqr = NULL;
+ fqr = nullptr;
}
//----------------------------------------------------------------------------------
@@ -393,7 +335,7 @@ void CWFastqReader::operator()()
while(input_files_queue->pop(file_name))
{
- fqr = new CFastqReader(mm, pmm_fastq, file_type, gzip_buffer_size, bzip2_buffer_size, kmer_len);
+ fqr = new CFastqReader(pmm_fastq, file_type, gzip_buffer_size, bzip2_buffer_size, kmer_len);
fqr->SetNames(file_name);
fqr->SetPartSize(part_size);
@@ -410,65 +352,4 @@ void CWFastqReader::operator()()
part_queue->mark_completed();
}
-
-
-//************************************************************************************************************
-// CWStatsFastqReader - wrapper for multithreading purposes
-//************************************************************************************************************
-CWStatsFastqReader::CWStatsFastqReader(CKMCParams &Params, CKMCQueues &Queues)
-{
- mm = Queues.mm;
- pmm_fastq = Queues.pmm_fastq;
-
- input_files_queue = Queues.input_files_queue;
- part_size = Params.fastq_buffer_size;
- stats_part_queue = Queues.stats_part_queue;
- file_type = Params.file_type;
- kmer_len = Params.p_k;
-
- gzip_buffer_size = Params.gzip_buffer_size;
- bzip2_buffer_size = Params.bzip2_buffer_size;
-
- fqr = NULL;
-}
-
-//----------------------------------------------------------------------------------
-CWStatsFastqReader::~CWStatsFastqReader()
-{
-}
-
-//----------------------------------------------------------------------------------
-void CWStatsFastqReader::operator()()
-{
- uchar *part;
- uint64 part_filled;
- bool finished = false;
- while (input_files_queue->pop(file_name) && !finished)
- {
- fqr = new CFastqReader(mm, pmm_fastq, file_type, gzip_buffer_size, bzip2_buffer_size, kmer_len);
- fqr->SetNames(file_name);
- fqr->SetPartSize(part_size);
-
- if (fqr->OpenFiles())
- {
- // Reading Fastq parts
- while (fqr->GetPart(part, part_filled))
- {
- if (!stats_part_queue->push(part, part_filled))
- {
- finished = true;
- pmm_fastq->free(part);
- break;
- }
-
- }
- }
- else
- cerr << "Error: Cannot open file " << file_name << "\n";
- delete fqr;
- }
- stats_part_queue->mark_completed();
-}
-
-
// ***** EOF
diff --git a/kmer_counter/fastq_reader.h b/kmc_tools/fastq_reader.h
similarity index 58%
copy from kmer_counter/fastq_reader.h
copy to kmc_tools/fastq_reader.h
index 5132d23..98e1e9b 100644
--- a/kmer_counter/fastq_reader.h
+++ b/kmc_tools/fastq_reader.h
@@ -2,17 +2,18 @@
This file is a part of KMC software distributed under GNU GPL 3 licence.
The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
- Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+ Authors: Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _FASTQ_READER_H
#define _FASTQ_READER_H
#include "defs.h"
-#include "params.h"
+#include "queues.h"
+#include "config.h"
#include <stdio.h>
#include <iostream>
@@ -22,17 +23,17 @@
using namespace std;
+
//************************************************************************************************************
// FASTA/FASTQ reader class
//************************************************************************************************************
class CFastqReader {
typedef enum {m_plain, m_gzip, m_bzip2} t_mode;
- CMemoryMonitor *mm;
CMemoryPool *pmm_fastq;
string input_file_name;
- input_type file_type;
+ CFilteringParams::file_type file_type;
int kmer_len;
t_mode mode;
@@ -48,15 +49,14 @@ class CFastqReader {
uint32 gzip_buffer_size;
uint32 bzip2_buffer_size;
-
- bool containsNextChromosome; //for multiline_fasta processing
+
bool SkipNextEOL(uchar *part, int64 &pos, int64 max_pos);
bool IsEof();
public:
- CFastqReader(CMemoryMonitor *_mm, CMemoryPool *_pmm_fastq, input_type _file_type, uint32 _gzip_buffer_size, uint32 _bzip2_buffer_size, int _kmer_len);
+ CFastqReader(CMemoryPool *_pmm_fastq, CFilteringParams::file_type _file_type, uint32 _gzip_buffer_size, uint32 _bzip2_buffer_size, int _kmer_len);
~CFastqReader();
static uint64 OVERHEAD_SIZE;
@@ -64,8 +64,7 @@ public:
bool SetNames(string _input_file_name);
bool SetPartSize(uint64 _part_size);
bool OpenFiles();
-
- bool GetPartFromMultilneFasta(uchar *&_part, uint64 &_size);
+
bool GetPart(uchar *&_part, uint64 &_size);
};
@@ -73,7 +72,6 @@ public:
// Wrapper for FASTA/FASTQ reader class - for multithreading purposes
//************************************************************************************************************
class CWFastqReader {
- CMemoryMonitor *mm;
CMemoryPool *pmm_fastq;
CFastqReader *fqr;
@@ -81,43 +79,18 @@ class CWFastqReader {
uint64 part_size;
CInputFilesQueue *input_files_queue;
CPartQueue *part_queue;
- input_type file_type;
+ CFilteringParams::file_type file_type;
uint32 gzip_buffer_size;
uint32 bzip2_buffer_size;
int kmer_len;
public:
- CWFastqReader(CKMCParams &Params, CKMCQueues &Queues);
+ CWFastqReader(CFilteringParams &Params, CFilteringQueues &Queues);
~CWFastqReader();
void operator()();
};
-
-
-//************************************************************************************************************
-// Wrapper for FASTA/FASTQ reader class (stats mode) - for multithreading purposes
-//************************************************************************************************************
-class CWStatsFastqReader {
- CMemoryMonitor *mm;
- CMemoryPool *pmm_fastq;
-
- CFastqReader *fqr;
- string file_name;
- uint64 part_size;
- CInputFilesQueue *input_files_queue;
- CStatsPartQueue *stats_part_queue;
- input_type file_type;
- uint32 gzip_buffer_size;
- uint32 bzip2_buffer_size;
- int kmer_len;
-
-public:
- CWStatsFastqReader(CKMCParams &Params, CKMCQueues &Queues);
- ~CWStatsFastqReader();
-
- void operator()();
-};
#endif
-// ***** EOF
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/fastq_writer.cpp b/kmc_tools/fastq_writer.cpp
new file mode 100644
index 0000000..68b8f6f
--- /dev/null
+++ b/kmc_tools/fastq_writer.cpp
@@ -0,0 +1,69 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include "fastq_writer.h"
+#include <iostream>
+using namespace std;
+
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Stores the output path taken from Params together with the queue of filtered parts
+// and the memory pool taken from Queues.
+CFastqWriter::CFastqWriter(CFilteringParams& Params, CFilteringQueues& Queues) :
+    output_src(Params.output_src),
+    filtered_part_queue(Queues.filtered_part_queue),
+    pmm_fastq_filter(Queues.pmm_fastq_filter)
+{
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+// Writes every part popped from filtered_part_queue to the file named by output_src.
+// Each part is handed back to pmm_fastq_filter once it has been written.
+// Any I/O failure (open, write, or the final flush performed by fclose) prints a message
+// and terminates the process with exit code 1.
+void CFastqWriter::Process()
+{
+    FILE* out = fopen(output_src.c_str(), "wb");
+    if (!out)
+    {
+        cout << "cannot open file :" << output_src;
+        exit(1);
+    }
+
+    uchar* part;
+    uint64 size;
+    while (filtered_part_queue->pop(part, size))
+    {
+        if (fwrite(part, 1, size, out) != size)
+        {
+            cout << "Error while writing to " << output_src << "\n";
+            exit(1);
+        }
+        pmm_fastq_filter->free(part);
+    }
+
+    // fclose flushes whatever is still buffered; a failure here means the output is incomplete
+    if (fclose(out) != 0)
+    {
+        cout << "Error while writing to " << output_src << "\n";
+        exit(1);
+    }
+}
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Constructs the wrapped CFastqWriter from the same parameters and queues.
+CWFastqWriter::CWFastqWriter(CFilteringParams& Params, CFilteringQueues& Queues) : writer(Params, Queues)
+{
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+// Call operator: forwards to the wrapped writer's Process().
+void CWFastqWriter::operator()()
+{
+ writer.Process();
+}
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/fastq_writer.h b/kmc_tools/fastq_writer.h
new file mode 100644
index 0000000..8ac9a5d
--- /dev/null
+++ b/kmc_tools/fastq_writer.h
@@ -0,0 +1,43 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _FASTQ_WRITER_H
+#define _FASTQ_WRITER_H
+
+#include "defs.h"
+#include "config.h"
+#include <string>
+
+//************************************************************************************************************
+// CFastqWriter - Writer of fastq/fasta file
+//************************************************************************************************************
+class CFastqWriter
+{
+ std::string output_src; // path of the output file
+ CPartQueue* filtered_part_queue; // queue of parts to be written
+ CMemoryPool *pmm_fastq_filter; // memory pool associated with the filtered parts
+public:
+ // Params and Queues supply the values for the members above.
+ CFastqWriter(CFilteringParams& Params, CFilteringQueues& Queues);
+ // Performs the writing.
+ void Process();
+};
+
+//************************************************************************************************************
+// CWFastqWriter - wrapper for CFastqWriter class - for multithreading purposes
+//************************************************************************************************************
+class CWFastqWriter
+{
+ CFastqWriter writer; // the writer this wrapper owns by value
+public:
+ CWFastqWriter(CFilteringParams& Params, CFilteringQueues& Queues);
+ // Call operator, so that an instance can be used as a callable (e.g. a thread body).
+ void operator()();
+};
+
+#endif
+
diff --git a/kmc_tools/histogram_writer.h b/kmc_tools/histogram_writer.h
new file mode 100644
index 0000000..1d97490
--- /dev/null
+++ b/kmc_tools/histogram_writer.h
@@ -0,0 +1,54 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _HISTOGRAM_WRITER_H
+#define _HISTOGRAM_WRITER_H
+
+#include "defs.h"
+#include "config.h"
+#include <vector>
+#include <fstream>
+
+// Builds a histogram of counter values read from a database reader (KMCDB must provide
+// bool NextCounter(uint32&)) and stores it as "<counter>\t<occurrences>" lines in the
+// file named by the global output description.
+template<typename KMCDB> class CHistogramWriter
+{
+    KMCDB& kmcdb;
+    COutputDesc& output_desc;
+    std::vector<uint32> counters;
+
+public:
+    CHistogramWriter(KMCDB& kmcdb) :kmcdb(kmcdb), output_desc(CConfig::GetInstance().output_desc)
+    {
+    }
+
+    // Counts the occurrences of every counter value within [cutoff_min, cutoff_max] and
+    // writes one line per value of that range. Exits the process if the output file
+    // cannot be opened; otherwise always returns true.
+    bool Process()
+    {
+        counters.resize(output_desc.cutoff_max + 1);
+
+        uint32 value;
+        while (kmcdb.NextCounter(value))
+        {
+            const bool below_range = value < output_desc.cutoff_min;
+            const bool above_range = value > output_desc.cutoff_max;
+            if (below_range || above_range)
+                continue;
+            ++counters[value];
+        }
+
+        std::ofstream file(output_desc.file_src);
+        if (!file)
+        {
+            std::cout << "Error: cannot open file: " << output_desc.file_src << "\n";
+            exit(1);
+        }
+
+        uint32 i = output_desc.cutoff_min;
+        while (i <= output_desc.cutoff_max)
+        {
+            file << i << "\t" << counters[i] << "\n";
+            ++i;
+        }
+        file.close();
+        return true;
+    }
+};
+
+#endif
\ No newline at end of file
diff --git a/kmc_tools/kmc1_db_reader.h b/kmc_tools/kmc1_db_reader.h
new file mode 100644
index 0000000..e25530f
--- /dev/null
+++ b/kmc_tools/kmc1_db_reader.h
@@ -0,0 +1,379 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _KMC1_DB_READER_H
+#define _KMC1_DB_READER_H
+#include "kmer.h"
+#include "defs.h"
+#include "config.h"
+#include "bundle.h"
+#include "kmc_header.h"
+#include "queues.h"
+#include <iostream>
+#include <cstring>
+#include <thread>
+
+// Access pattern requested from a KMC database reader. As used by CKMC1DbReader:
+// - sequential and sorted: both the prefix (.kmc_pre) and the suffix (.kmc_suf) files are opened and buffered,
+// - sorted: additionally a background thread fills a circular queue with bundles of k-mers,
+// - counters_only: only the suffix file is opened and no prefix buffer is allocated.
+enum class KMCDBOpenMode { sequential, sorted, counters_only };
+
+//************************************************************************************************************
+// CKMC1DbReader - reader of KMC1 database
+//************************************************************************************************************
+// Reader of a KMC1 database (a .kmc_pre prefix file plus a .kmc_suf suffix file).
+// Depending on the open mode it serves k-mers one by one (NextKmerSequential), bare counters
+// (NextCounter), or bundles produced by a background thread through a circular queue
+// (NextBundle / IgnoreRest, sorted mode only).
+template<unsigned SIZE> class CKMC1DbReader : public CInput<SIZE>
+{
+public:
+    CKMC1DbReader(const CKMC_header& header, const CInputDesc& desc, CPercentProgress& percent_progress, KMCDBOpenMode open_mode);
+
+    // Pops the next bundle produced by the background thread. When the queue reports that
+    // nothing is left, the reader is marked finished and the thread and queue are released.
+    void NextBundle(CBundle<SIZE>& bundle) override
+    {
+        if (circular_queue == nullptr) // already released by an earlier call
+        {
+            this->finished = true;
+            return;
+        }
+
+        bool exists = circular_queue->pop(bundle.Data());
+
+        percent_progress.UpdateItem(progress_id, bundle.Size());
+
+        if (exists)
+            return;
+
+        percent_progress.Complete(progress_id);
+
+        this->finished = true;
+        release_sorted_access();
+    }
+
+    // Abandons the remaining input: forces the queue to finish so that the background
+    // thread can stop, then releases the thread and the queue.
+    void IgnoreRest() override
+    {
+        if (circular_queue != nullptr)
+            circular_queue->force_finish();
+        this->finished = true;
+        release_sorted_access();
+    }
+
+    ~CKMC1DbReader()
+    {
+        // A reader destroyed before it was drained must still stop and join its thread,
+        // otherwise destroying a joinable std::thread calls std::terminate.
+        if (circular_queue != nullptr)
+            circular_queue->force_finish();
+        release_sorted_access();
+
+        if(prefix_file != nullptr)
+            fclose(prefix_file);
+        if(sufix_file != nullptr)
+            fclose(sufix_file);
+        delete[] prefix_buff;
+        delete[] sufix_buff;
+    }
+
+    // Returns the next k-mer accepted by next_kmer_sorted() and updates the progress;
+    // returns false (and completes the progress item) when there are no more k-mers.
+    bool NextKmerSequential(CKmer<SIZE>& kmer, uint32& counter)
+    {
+        if (next_kmer_sorted(kmer, counter))
+        {
+            percent_progress.UpdateItem(progress_id);
+            return true;
+        }
+        percent_progress.Complete(progress_id);
+        return false;
+    }
+
+    bool NextCounter(uint32& counter);
+
+private:
+    static const uint32 PREFIX_BUFF_BYTES = KMC1_DB_READER_PREFIX_BUFF_BYTES;
+    static const uint32 SUFIX_BUFF_BYTES = KMC1_DB_READER_SUFIX_BUFF_BYTES;
+    const CKMC_header& header;
+    const CInputDesc& desc;
+
+    CPercentProgress& percent_progress;
+    KMCDBOpenMode open_mode;
+
+    uint32 progress_id;
+
+    FILE* prefix_file;
+    FILE* sufix_file;
+
+    uint32 record_size; //of sufix, in bytes
+    uint32 current_preffix;
+    uint32 sufix_bytes;
+    uint64* prefix_buff = nullptr;
+    uchar* sufix_buff = nullptr;
+
+    uint32 prefix_bytes;
+    uint32 kmer_bytes;
+
+    uint64 prefix_buff_size;
+    uint64 sufix_buff_size;
+
+    uint64 prefix_buff_pos;
+    uint64 sufix_buff_pos;
+
+    uint64 prefix_left_to_read;
+    uint64 sufix_left_to_read;
+
+    std::string prefix_file_name;
+    std::string sufix_file_name;
+
+    uint64 sufix_number;
+
+    CCircularQueue<SIZE>* circular_queue = nullptr; //for sorted access only
+    std::thread sorted_access_thread;
+
+    void reload_pref_buff();
+
+    bool reload_suf_buff();
+
+    bool next_kmer_sorted(CKmer<SIZE>& kmer, uint32& counter);
+
+    void open_files();
+
+    // Allocates the suffix buffer always and the prefix buffer only in the modes that read prefixes.
+    void allocate_buffers()
+    {
+        sufix_buff = new uchar[sufix_buff_size];
+        if (open_mode == KMCDBOpenMode::sequential || open_mode == KMCDBOpenMode::sorted)
+            prefix_buff = new uint64[prefix_buff_size];
+    }
+
+    // Joins the background thread (if it is still joinable) and frees the circular queue.
+    // Safe to call more than once: the pointer is reset after deletion.
+    void release_sorted_access()
+    {
+        if (sorted_access_thread.joinable())
+            sorted_access_thread.join();
+        delete circular_queue;
+        circular_queue = nullptr;
+    }
+};
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+
+// Derives the record and buffer sizes from the database header, opens the files, fills the
+// buffers for the first time and, in sorted mode, starts the background thread that packs
+// k-mers into bundles and pushes them to the circular queue.
+template<unsigned SIZE> CKMC1DbReader<SIZE>::CKMC1DbReader(const CKMC_header& header, const CInputDesc& desc, CPercentProgress& percent_progress, KMCDBOpenMode open_mode) :
+    header(header), desc(desc), percent_progress(percent_progress), open_mode(open_mode)
+{
+    progress_id = percent_progress.RegisterItem(header.total_kmers);
+
+    prefix_file = sufix_file = nullptr;
+    sufix_bytes = (header.kmer_len - header.lut_prefix_len) / 4;
+    record_size = sufix_bytes + header.counter_size;
+    sufix_buff_size = SUFIX_BUFF_BYTES / record_size * record_size; // whole records only
+    prefix_buff_size = PREFIX_BUFF_BYTES / sizeof(uint64);
+
+    sufix_left_to_read = header.total_kmers * record_size;
+
+    if (sufix_left_to_read < sufix_buff_size)
+        sufix_buff_size = sufix_left_to_read;
+
+    // 64-bit shift: shifting the int literal 1 is undefined once 2 * lut_prefix_len reaches 32.
+    // One entry fewer than 4^lut_prefix_len because open_files() skips the first one.
+    prefix_left_to_read = (1ull << (2 * header.lut_prefix_len)) - 1;
+
+    if (prefix_left_to_read < prefix_buff_size)
+        prefix_buff_size = prefix_left_to_read;
+
+    prefix_bytes = (header.lut_prefix_len + 3) / 4;
+
+    kmer_bytes = prefix_bytes + sufix_bytes;
+
+    open_files();
+    allocate_buffers();
+    if(open_mode == KMCDBOpenMode::sequential || open_mode == KMCDBOpenMode::sorted)
+        reload_pref_buff();
+    reload_suf_buff();
+
+    current_preffix = 0;
+    sufix_number = 0;
+
+    if (open_mode == KMCDBOpenMode::sorted)
+    {
+        circular_queue = new CCircularQueue<SIZE>(DEFAULT_CIRCULAL_QUEUE_CAPACITY);
+        sorted_access_thread = std::thread([this]{
+
+            CKmer<SIZE> kmer;
+            uint32 counter;
+            CBundleData<SIZE> bundle_data;
+
+            while (next_kmer_sorted(kmer, counter))
+            {
+                bundle_data.Insert(kmer, counter);
+                if (bundle_data.Full())
+                {
+                    // a failed push ends production early
+                    if (!this->circular_queue->push(bundle_data))
+                        break;
+                }
+            }
+            // hand over the last, partially filled bundle
+            if (!bundle_data.Empty())
+                this->circular_queue->push(bundle_data);
+            this->circular_queue->mark_completed();
+        });
+    }
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Fetches the counter of the next record whose value lies within [desc.cutoff_min, desc.cutoff_max].
+// The counter is stored after the suffix bytes of a record, least significant byte first.
+// Returns false when all header.total_kmers records have been consumed.
+template<unsigned SIZE> bool CKMC1DbReader<SIZE>::NextCounter(uint32& counter)
+{
+    while (sufix_number < header.total_kmers)
+    {
+        uchar* counter_field = sufix_buff + sufix_buff_pos + sufix_bytes;
+
+        uint32 value = 0;
+        int32 byte_idx = header.counter_size;
+        while (--byte_idx >= 0)
+            value = (value << 8) + counter_field[byte_idx];
+        counter = value;
+
+        sufix_buff_pos += record_size;
+        ++sufix_number;
+
+        // buffer exhausted - fetch the next portion of the suffix file
+        if (sufix_buff_pos >= sufix_buff_size)
+            reload_suf_buff();
+
+        const bool too_small = counter < desc.cutoff_min;
+        const bool too_big = counter > desc.cutoff_max;
+        if (!too_small && !too_big)
+            return true;
+    }
+    return false;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Refills the prefix buffer from the prefix file and rewinds the buffer position.
+// When fewer entries than the buffer capacity are available, a guard entry equal to
+// header.total_kmers is placed right after the data that was read.
+template<unsigned SIZE> void CKMC1DbReader<SIZE>::reload_pref_buff()
+{
+    const uint64 entries = MIN(prefix_left_to_read, prefix_buff_size);
+    prefix_buff_pos = 0;
+
+    if (entries == 0)
+    {
+        prefix_buff[0] = header.total_kmers;//guard
+        return;
+    }
+
+    const uint64 entries_read = fread(prefix_buff, sizeof(uint64), entries, prefix_file);
+    if (entries_read != entries)
+    {
+        std::cout << "Error: some error while reading " << prefix_file_name << "\n";
+        exit(1);
+    }
+
+    prefix_left_to_read -= entries;
+    if (entries < prefix_buff_size)
+        prefix_buff[entries] = header.total_kmers;//guard
+}
+
+/*****************************************************************************************************************************/
+// Refills the suffix buffer from the suffix file.
+// Returns false (leaving the buffer and its position untouched) when nothing is left to read,
+// true after a successful read. A short read terminates the process.
+template<unsigned SIZE> bool CKMC1DbReader<SIZE>::reload_suf_buff()
+{
+    const uint64 bytes_wanted = MIN(sufix_left_to_read, sufix_buff_size);
+    if (bytes_wanted == 0)
+        return false;
+
+    if (fread(sufix_buff, 1, bytes_wanted, sufix_file) != bytes_wanted)
+    {
+        std::cout << "Error: some error while reading " << sufix_file_name << "\n";
+        exit(1);
+    }
+
+    sufix_left_to_read -= bytes_wanted;
+    sufix_buff_pos = 0;
+    return true;
+}
+
+/*****************************************************************************************************************************/
+// Opens the suffix file (and, in sequential/sorted mode, the prefix file) of the database
+// named by desc.file_src. Verifies the "KMCS" marker at both ends of the suffix file and
+// leaves each file positioned at the start of its data. Any failure terminates the process.
+template<unsigned SIZE> void CKMC1DbReader<SIZE>::open_files()
+{
+
+    sufix_file_name = desc.file_src + ".kmc_suf";
+
+    sufix_file = fopen(sufix_file_name.c_str(), "rb");
+
+    if (!sufix_file)
+    {
+        std::cout << "Error: cannot open file: " << sufix_file_name << "\n";
+        exit(1);
+    }
+    // only touch the stream once it is known to be valid; the reader does its own buffering
+    setvbuf(sufix_file, NULL, _IONBF, 0);
+
+    char marker[4];
+    if (fread(marker, 1, 4, sufix_file) != 4)
+    {
+        std::cout << "Error: while reading start marker in file: " << sufix_file_name << "\n";
+        exit(1);
+    }
+
+    if (strncmp(marker, "KMCS", 4) != 0)
+    {
+        std::cout << "Error: wrong start marker in file: " << sufix_file_name << "\n";
+        exit(1);
+    }
+
+
+    my_fseek(sufix_file, -4, SEEK_END);
+    if (fread(marker, 1, 4, sufix_file) != 4)
+    {
+        std::cout << "Error: while reading end marker in file: " << sufix_file_name << "\n";
+        exit(1);
+    }
+
+    if (strncmp(marker, "KMCS", 4) != 0)
+    {
+        std::cout << "Error: wrong end marker in file: " << sufix_file_name << "\n";
+        exit(1);
+    }
+    my_fseek(sufix_file, 4, SEEK_SET); //skip KMCS
+
+    if (open_mode == KMCDBOpenMode::sequential || open_mode == KMCDBOpenMode::sorted)
+    {
+        prefix_file_name = desc.file_src + ".kmc_pre";
+
+        prefix_file = fopen(prefix_file_name.c_str(), "rb");
+
+        if (!prefix_file)
+        {
+            std::cout << "Error: cannot open file: " << prefix_file_name << "\n";
+            exit(1);
+        }
+        setvbuf(prefix_file, NULL, _IONBF, 0);
+
+        my_fseek(prefix_file, 4 + sizeof(uint64), SEEK_SET);//skip KMCP and first value as it must be 0
+    }
+}
+
+
+/*****************************************************************************************************************************/
+// Reads records one after another until one has a counter within [desc.cutoff_min, desc.cutoff_max];
+// that record's k-mer (prefix bytes + suffix bytes) and counter are returned through the arguments.
+// Returns false once sufix_number reaches header.total_kmers.
+template<unsigned SIZE> bool CKMC1DbReader<SIZE>::next_kmer_sorted(CKmer<SIZE>& kmer, uint32& counter)
+{
+ while (true)
+ {
+ if (sufix_number >= header.total_kmers)
+ return false;
+
+ // Advance the current prefix while the prefix table entry does not exceed the index of the
+ // current record (presumably the entries are cumulative record counts per prefix - TODO confirm).
+ // The guard written by reload_pref_buff() (total_kmers) stops this loop at the end of the table.
+ while (prefix_buff[prefix_buff_pos] <= sufix_number)
+ {
+ ++current_preffix;
+ ++prefix_buff_pos;
+ if (prefix_buff_pos >= prefix_buff_size)
+ reload_pref_buff();
+ }
+
+ uchar* record = sufix_buff + sufix_buff_pos;
+ uint32 pos = kmer_bytes - 1;
+
+ // NOTE(review): the counter is read from record[0..counter_size) below, which is only right if
+ // load() advances `record` past the suffix bytes (i.e. takes the pointer by reference) - confirm in kmer.h.
+ kmer.load(record, sufix_bytes);
+ // place the prefix in the top bytes of the k-mer, most significant byte first
+ for (int32 i = prefix_bytes - 1; i >= 0; --i)
+ kmer.set_byte(pos--, current_preffix >> (i << 3));
+
+ // assemble the counter; record[0] ends up as the least significant byte
+ counter = 0;
+ for (int32 i = header.counter_size - 1; i >= 0; --i)
+ {
+ counter <<= 8;
+ counter += record[i];
+ }
+
+ ++sufix_number;
+ sufix_buff_pos += record_size;
+
+ if (sufix_buff_pos >= sufix_buff_size)
+ reload_suf_buff();
+
+ // records outside the cutoff range are skipped
+ if (counter >= desc.cutoff_min && counter <= desc.cutoff_max)
+ return true;
+ }
+}
+
+
+#endif
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/kmc1_db_writer.h b/kmc_tools/kmc1_db_writer.h
new file mode 100644
index 0000000..bc15de2
--- /dev/null
+++ b/kmc_tools/kmc1_db_writer.h
@@ -0,0 +1,380 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _KMC1_DB_WRITER_H
+#define _KMC1_DB_WRITER_H
+
+#include "defs.h"
+#include "config.h"
+#include "queues.h"
+
+#include <string>
+#include <vector>
+
+//************************************************************************************************************
+// CKMC1SufixFileWriter - thread for writing sufixes' parts
+//************************************************************************************************************
+// Callable that drains a queue of suffix buffers into an already opened kmc_suf file.
+class CKMC1SufixFileWriter
+{
+    CSufWriteQueue& input_queue;
+    FILE* kmc_suf;
+
+public:
+    CKMC1SufixFileWriter(CSufWriteQueue& input_queue, FILE* kmc_suf) : input_queue(input_queue), kmc_suf(kmc_suf)
+    {
+    }
+
+    // Writes every buffer popped from the queue and frees it afterwards.
+    // A short write terminates the process.
+    void operator()()
+    {
+        uchar* buf;
+        uint32 size;
+        for (;;)
+        {
+            if (!input_queue.pop(buf, size))
+                break;
+
+            const size_t written = fwrite(buf, 1, size, kmc_suf);
+            if (written != size)
+            {
+                std::cout << "Error while writting to kmc_suf file\n";
+                exit(1);
+            }
+            delete[] buf;
+        }
+    }
+};
+
+//************************************************************************************************************
+// CKMC1DbWriter - writer of KMC1 database
+//************************************************************************************************************
+// Writes the k-mers delivered by a bundle as a KMC1 database: a prefix file (.kmc_pre) holding
+// a table of uint64 entries followed by the header, and a suffix file (.kmc_suf) holding the records.
+template<unsigned SIZE> class CKMC1DbWriter
+{
+public:
+ CKMC1DbWriter(CBundle<SIZE>* bundle);
+ ~CKMC1DbWriter();
+ // Runs the whole write; see the definition for the threads involved.
+ bool Process();
+
+private:
+ static const uint32 PRE_BUFF_SIZE_BYTES = KMC1_DB_WRITER_PREFIX_BUFF_BYTES;
+ static const uint32 SUF_BUFF_SIZE_BYTES = KMC1_DB_WRITER_SUFIX_BUFF_BYTES;
+
+ CConfig& config;
+ CBundle<SIZE>* bundle; // source of the k-mers to store
+ FILE* kmc_pre, *kmc_suf;
+ uint32 lut_prefix_len; // set by calc_lut_prefix_len()
+ uint32 current_prefix; // prefix whose table entries have been emitted so far
+ uint32 counter_size; // bytes used to store a counter
+ uint32 pre_buff_size; // capacity of pre_buff, in uint64 entries
+ uint32 suf_buff_size; // capacity of suf_buff, in records
+ uint64* pre_buff;
+ uchar* suf_buff;
+ uint64 added_kmers; // number of k-mers stored so far
+ uint32 sufix_rec_bytes; // size of one suffix record (suffix bytes + counter bytes)
+ uint32 suf_pos, pre_pos; // fill levels of suf_buff (records) and pre_buff (entries)
+
+ void store_pre_buf();
+ void send_suf_buf_to_queue();
+ void start_writting();
+ inline void add_kmer(CKmer<SIZE>& kmer, uint32 counter);
+ void finish_writting();
+
+ template<typename T> void write_header_part(T data);
+ void calc_lut_prefix_len();
+
+
+ CCircularQueue<SIZE> bundles_queue; // bundles waiting to be converted to output buffers
+ CSufWriteQueue suf_buf_queue; // filled suffix buffers waiting to be written
+
+};
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Opens the output .kmc_pre / .kmc_suf files named after config.output_desc.file_src,
+// chooses the LUT prefix length, derives the record and buffer sizes and allocates the buffers.
+// A file that cannot be opened terminates the process.
+template <unsigned SIZE> CKMC1DbWriter<SIZE>::CKMC1DbWriter(CBundle<SIZE>* bundle) :
+    config(CConfig::GetInstance()),
+    bundle(bundle),
+    bundles_queue(DEFAULT_CIRCULAL_QUEUE_CAPACITY)
+{
+    kmc_pre = NULL;
+    kmc_suf = NULL;
+    pre_buff = NULL;
+    suf_buff = NULL;
+    std::string kmc_pre_file_name = config.output_desc.file_src + ".kmc_pre";
+    std::string kmc_suf_file_name = config.output_desc.file_src + ".kmc_suf";
+
+    kmc_pre = fopen(kmc_pre_file_name.c_str(), "wb");
+    if (!kmc_pre)
+    {
+        std::cout << "Error: cannot open file : " << kmc_pre_file_name << "\n";
+        exit(1);
+    }
+    // unbuffered stream: set only after the handle is known to be valid
+    setvbuf(kmc_pre, NULL, _IONBF, 0);
+
+    kmc_suf = fopen(kmc_suf_file_name.c_str(), "wb");
+    if (!kmc_suf)
+    {
+        fclose(kmc_pre);
+        std::cout << "Error: cannot open file : " << kmc_suf_file_name << "\n";
+        exit(1);
+    }
+    setvbuf(kmc_suf, NULL, _IONBF, 0);
+
+    // Calculate LUT size
+    calc_lut_prefix_len();
+
+    counter_size = MIN(BYTE_LOG(config.output_desc.counter_max), BYTE_LOG(config.output_desc.cutoff_max));
+    sufix_rec_bytes = (config.kmer_len - lut_prefix_len) / 4 + counter_size;
+    current_prefix = 0;
+    added_kmers = 0;
+    pre_buff_size = PRE_BUFF_SIZE_BYTES / sizeof(uint64);
+    suf_buff_size = SUF_BUFF_SIZE_BYTES / sufix_rec_bytes;
+    suf_pos = pre_pos = 0;
+
+    pre_buff = new uint64[pre_buff_size];
+    pre_buff[pre_pos++] = 0; // the first table entry is always 0
+    suf_buff = new uchar[suf_buff_size * sufix_rec_bytes];
+
+
+    suf_buf_queue.init(suf_buff_size * sufix_rec_bytes, SUFIX_WRITE_QUEUE_CAPACITY);
+
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Writes the whole database. Three parties cooperate:
+// - the calling thread moves bundles from `bundle` into bundles_queue,
+// - preparing_thread turns queued bundles into prefix/suffix buffers (add_kmer),
+// - suf_buf_writing_thread writes the filled suffix buffers to kmc_suf.
+// Always returns true; I/O errors terminate the process inside the helpers.
+template<unsigned SIZE> bool CKMC1DbWriter<SIZE>::Process()
+{
+
+ start_writting();
+
+ //Converts bundles to output buffers, sufix buffer is placed to another queue and write in separate thread (sufix_writer)
+ std::thread preparing_thread([this]{
+ CBundleData<SIZE> bundle_data;
+ while (bundles_queue.pop(bundle_data))
+ {
+ while (!bundle_data.Empty())
+ {
+ add_kmer(bundle_data.TopKmer(), bundle_data.TopCounter());
+ bundle_data.Pop();
+ }
+ }
+ // hand over the last, possibly partially filled suffix buffer and close the queue
+ suf_buf_queue.push(suf_buff, sufix_rec_bytes * suf_pos);
+ suf_buf_queue.mark_completed();
+ });
+
+
+
+ CKMC1SufixFileWriter sufix_writer(suf_buf_queue, kmc_suf);
+ std::thread suf_buf_writing_thread(std::ref(sufix_writer));
+
+#ifdef ENABLE_LOGGER
+ CTimer timer;
+
+#endif
+ while (!bundle->Finished())
+ {
+#ifdef ENABLE_LOGGER
+ timer.start();
+#endif
+ bundles_queue.push(bundle->Data());
+#ifdef ENABLE_LOGGER
+ CLoger::GetLogger().log_operation("dodawanie do kolejki wyjsciowej bundla", this, timer.get_time());
+#endif
+ }
+
+ bundles_queue.mark_completed();
+
+ // both workers must be done before the prefix table and the header are finalized
+ preparing_thread.join();
+ suf_buf_writing_thread.join();
+
+ finish_writting();
+ return true;
+}
+
+
+/*****************************************************************************************************************************/
+// Releases the prefix and suffix buffers allocated by the constructor.
+template<unsigned SIZE> CKMC1DbWriter<SIZE>::~CKMC1DbWriter()
+{
+    delete[] pre_buff;
+    delete[] suf_buff;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Appends `data` to kmc_pre as sizeof(T) bytes, least significant byte first.
+// A failed write terminates the process.
+template <unsigned SIZE> template <typename T> void CKMC1DbWriter<SIZE>::write_header_part(T data)
+{
+    uint32 byte_no = 0;
+    while (byte_no < sizeof(T))
+    {
+        char c = (data >> (byte_no << 3)) & 0xff;
+        const bool failed = putc(c, kmc_pre) == EOF;
+        if (failed)
+        {
+            std::cout << "Error while writing header of kmc1\n";
+            exit(1);
+        }
+        ++byte_no;
+    }
+}
+
+/*****************************************************************************************************************************/
+// Writes the opening 4-byte markers: "KMCP" to the prefix file, then "KMCS" to the suffix file.
+// A failed write terminates the process.
+template<unsigned SIZE> void CKMC1DbWriter<SIZE>::start_writting()
+{
+    const bool pre_marker_ok = fwrite("KMCP", 1, 4, kmc_pre) == 4;
+    if (!pre_marker_ok)
+    {
+        std::cout << "Error while writting starting KMCP marker";
+        exit(1);
+    }
+
+    const bool suf_marker_ok = fwrite("KMCS", 1, 4, kmc_suf) == 4;
+    if (!suf_marker_ok)
+    {
+        std::cout << "Error while writting starting KMCS marker";
+        exit(1);
+    }
+}
+
+/*****************************************************************************************************************************/
+// Completes both files: pads the prefix table, appends the header to kmc_pre, writes the
+// closing markers and closes the files. Write errors terminate the process.
+template<unsigned SIZE> void CKMC1DbWriter<SIZE>::finish_writting()
+{
+ uint32 max_prefix = (1 << 2 * lut_prefix_len);
+ // every prefix after the last one that received a k-mer gets the final k-mer count
+ while (current_prefix < max_prefix - 1)
+ {
+ pre_buff[pre_pos++] = added_kmers;
+ ++current_prefix;
+ if (pre_pos == pre_buff_size)
+ store_pre_buf();
+ }
+ store_pre_buf();
+ // NOTE(review): Process() calls this function after joining the suffix writer thread, and the
+ // preparing thread has already pushed the last suffix buffer and marked the queue completed,
+ // so nothing appears to consume this push - confirm against CSufWriteQueue.
+ send_suf_buf_to_queue();
+
+ //store header
+ write_header_part(config.kmer_len);
+ write_header_part(config.headers.front().mode);
+ write_header_part(counter_size);
+ write_header_part(lut_prefix_len);
+ write_header_part(config.output_desc.cutoff_min);
+ write_header_part(config.output_desc.cutoff_max);
+ write_header_part(added_kmers);
+
+ bool both_stands = false;
+ for (auto& input : config.headers)
+ both_stands = both_stands || input.both_strands; //if any input database is in both strands, output is also in both strands
+
+ // the stored flag is the negation of "both strands"
+ write_header_part(!both_stands);
+
+
+ // 31 zero bytes follow the flag (presumably reserved space - TODO confirm)
+ for (uint32 i = 0; i < 31; ++i)
+ write_header_part(uchar(0));
+
+ // NOTE(review): 64 is presumably the header size/offset expected by readers - confirm
+ write_header_part((uint32)64);
+
+
+ if (fwrite("KMCP", 1, 4, kmc_pre) != 4)
+ {
+ std::cout << "Error while writting end KMCP marker";
+ exit(1);
+ }
+ if (fwrite("KMCS", 1, 4, kmc_suf) != 4)
+ {
+ std::cout << "Error while writting end KMCS marker";
+ exit(1);
+ }
+ fclose(kmc_pre);
+ fclose(kmc_suf);
+}
+
+/*****************************************************************************************************************************/
+// Appends one k-mer with its counter to the output buffers.
+// K-mers whose counter is outside [cutoff_min, cutoff_max] are dropped; counters above
+// counter_max are clamped to counter_max.
+template<unsigned SIZE> void CKMC1DbWriter<SIZE>::add_kmer(CKmer<SIZE>& kmer, uint32 counter)
+{
+ if (counter < config.output_desc.cutoff_min || counter > config.output_desc.cutoff_max)
+ return;
+ if (counter > config.output_desc.counter_max)
+ counter = config.output_desc.counter_max;
+ uint64 kmer_prefix = kmer.remove_suffix((config.kmer_len - lut_prefix_len) * 2);
+ // emit a table entry (k-mers stored so far) for every prefix up to the one of this k-mer
+ while (current_prefix < kmer_prefix)
+ {
+ pre_buff[pre_pos++] = added_kmers;
+ ++current_prefix;
+ if (pre_pos == pre_buff_size)
+ store_pre_buf();
+ }
+ uchar* rec = suf_buff + suf_pos * sufix_rec_bytes;
+
+ // NOTE(review): the counter bytes are written through `rec` right after this call, which is only
+ // right if store() advances `rec` past the suffix bytes (takes the pointer by reference) - confirm in kmer.h.
+ kmer.store(rec, sufix_rec_bytes - counter_size);
+ // counter, least significant byte first
+ for (uint32 i = 0; i < counter_size; ++i)
+ *rec++ = counter >> (i << 3);
+ ++suf_pos;
+ if (suf_pos == suf_buff_size)
+ send_suf_buf_to_queue();
+ ++added_kmers;
+}
+
+/*****************************************************************************************************************************/
+// Flushes the pre_pos entries collected in pre_buff to kmc_pre and empties the buffer.
+// A short write terminates the process.
+template<unsigned SIZE> void CKMC1DbWriter<SIZE>::store_pre_buf()
+{
+    const size_t entries_written = fwrite(pre_buff, sizeof(uint64), pre_pos, kmc_pre);
+    if (entries_written != pre_pos)
+    {
+        std::cout << "Error while writting to kmc_pre file\n";
+        exit(1);
+    }
+    pre_pos = 0;
+}
+
+/*****************************************************************************************************************************/
+// Hands the records collected in suf_buff over to suf_buf_queue and restarts filling from record 0.
+template<unsigned SIZE> void CKMC1DbWriter<SIZE>::send_suf_buf_to_queue()
+{
+    uint32 bytes_used = sufix_rec_bytes * suf_pos;
+    suf_buf_queue.push(suf_buff, bytes_used);
+    suf_pos = 0;
+}
+
+/*****************************************************************************************************************************/
+// Chooses lut_prefix_len for the output database.
+// For every input header the prefix length from 6..15 is found that minimizes the estimated
+// size (suffix bytes of all k-mers + the LUT), considering only lengths that leave a suffix
+// length divisible by 4. The largest of these per-input results is used.
+// With no input headers lut_prefix_len is set to 0.
+template<unsigned SIZE> void CKMC1DbWriter<SIZE>::calc_lut_prefix_len()
+{
+    uint32 chosen_len = 0;
+
+    for (const auto& input_header : config.headers)
+    {
+        uint32 best_lut_prefix_len = 0;
+        uint64 best_mem_amount = 1ull << 62;
+        for (uint32 candidate = 6; candidate < 16; ++candidate)
+        {
+            uint32 suffix_len = input_header.kmer_len - candidate;
+            if (suffix_len % 4)
+                continue;
+
+            uint64 suf_mem = input_header.total_kmers * suffix_len / 4;
+            uint64 lut_mem = (1ull << (2 * candidate)) * sizeof(uint64);
+
+            if (suf_mem + lut_mem < best_mem_amount)
+            {
+                best_lut_prefix_len = candidate;
+                best_mem_amount = suf_mem + lut_mem;
+            }
+        }
+
+        //TODO for now the LUT size is the largest of the best sizes found for the input databases
+        if (best_lut_prefix_len > chosen_len)
+            chosen_len = best_lut_prefix_len;
+    }
+
+    lut_prefix_len = chosen_len;
+}
+
+#endif
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/kmc2_db_reader.h b/kmc_tools/kmc2_db_reader.h
new file mode 100644
index 0000000..2e0acca
--- /dev/null
+++ b/kmc_tools/kmc2_db_reader.h
@@ -0,0 +1,1398 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _KMC2_DB_READER_H
+#define _KMC2_DB_READER_H
+
+#include "config.h"
+#include "bundle.h"
+#include "queues.h"
+#include <vector>
+#include <mutex>
+#include <memory>
+#include <tuple>
+
+//#include <stack>
+#include <queue>
+
+#include <condition_variable>
+
+
+//Forward declaration
+template<unsigned SIZE> class CKMC2DbReaderSorted;
+
+template<unsigned SIZE> class CBin;
+
+
+// (pointer, byte count) pair describing a buffer. The struct declares no destructor,
+// so it never frees buf itself. It is move-only: moving transfers both fields and
+// leaves the source as (nullptr, 0).
+struct CBinBuff //must be moveable
+{
+    uchar* buf;
+    uint32 size;
+
+    CBinBuff() : buf(nullptr), size(0) {}
+
+    CBinBuff(uchar* buf, uint32 size) : buf(buf), size(size) {}
+
+    CBinBuff(const CBinBuff&) = delete;
+    CBinBuff& operator=(const CBinBuff&) = delete;
+
+#ifdef WIN32
+    CBinBuff(CBinBuff&& rhs) throw()
+#else
+    CBinBuff(CBinBuff&& rhs) noexcept
+#endif
+        : buf(rhs.buf), size(rhs.size)
+    {
+        rhs.buf = nullptr;
+        rhs.size = 0;
+    }
+
+#ifdef WIN32
+    CBinBuff& operator=(CBinBuff&& rhs) throw()
+#else
+    CBinBuff& operator=(CBinBuff&& rhs) noexcept
+#endif
+    {
+        // Self-move must leave the object untouched.
+        if (this == &rhs)
+            return *this;
+
+        buf = rhs.buf;
+        size = rhs.size;
+        rhs.buf = nullptr;
+        rhs.size = 0;
+        return *this;
+    }
+};
+
+// Hand-over point between the thread that reads the suffix file (get_next_to_read /
+// notify_bin_filled) and the bin consumers (pop).
+//
+// Each bin has one internal buffer that the reader fills while the consumer works on
+// the buffer it currently holds; pop() swaps the two. desc, to_read,
+// bins_left_to_read and forced_to_finish are accessed under mtx in every method
+// defined in this class body.
+template<unsigned SIZE> class CBinBufProvider
+{
+    std::vector<CBinBuff> internal_bufs;
+    // Backing allocations created by init(). They start as nullptr so the destructor's
+    // delete[] is well defined even if init() was never called or threw while allocating.
+    uchar *buf_bins = nullptr, *buf_internal = nullptr;
+    uint32 bins_left_to_read = 0;
+    uint32 max_bin_bytes = 0;
+    uint32 rec_size = 0;
+
+    using desc_t = std::tuple<uint64, uint64, bool>;//current_kmer, last_kmer, is_empty
+    using to_read_t = std::tuple<uint32, uint64, uchar*, uint32>;//bin_id, file_pos, buffer to read into, size to read
+
+    std::vector<desc_t> desc;
+    //std::stack<to_read_t, std::vector<to_read_t>> to_read;
+    std::queue<to_read_t, std::list<to_read_t>> to_read;
+
+    mutable std::mutex mtx;
+    std::condition_variable cv_pop;
+    std::condition_variable cv_get_next_to_read;
+
+    bool forced_to_finish = false;
+
+public:
+    void init(std::vector<CBin<SIZE>>& bins);
+
+    // Blocks until the internal buffer of bin_id has been filled, then swaps it with
+    // bin_buf. If the bin still has k-mers on disk, the buffer received from the caller
+    // is queued for the next read; otherwise the bin is counted as finished.
+    void pop(uint32 bin_id, CBinBuff& bin_buf)
+    {
+        std::unique_lock<std::mutex> lck(mtx);
+        cv_pop.wait(lck, [this, bin_id]{return !std::get<2>(desc[bin_id]); });
+
+        std::swap(bin_buf, internal_bufs[bin_id]);
+        std::get<2>(desc[bin_id]) = true;
+
+        uint64 kmers_left = std::get<1>(desc[bin_id]) - std::get<0>(desc[bin_id]);
+        if (kmers_left)
+        {
+            uint32 kmers_to_read = (uint32)MIN(kmers_left, max_bin_bytes / rec_size);
+            internal_bufs[bin_id].size = kmers_to_read * rec_size;
+            bool was_empty = to_read.empty();
+            // File offset of record k is 4 + k * rec_size.
+            to_read.push(std::make_tuple(bin_id, 4 + std::get<0>(desc[bin_id]) * rec_size, internal_bufs[bin_id].buf, internal_bufs[bin_id].size));
+            std::get<0>(desc[bin_id]) += kmers_to_read;
+            // A reader can only be waiting for work if the queue was empty.
+            if (was_empty)
+                cv_get_next_to_read.notify_all();
+        }
+        else
+        {
+            --bins_left_to_read;
+            // Last bin finished: wake the reader so that get_next_to_read() can return false.
+            if (!bins_left_to_read)
+                cv_get_next_to_read.notify_all();
+        }
+    }
+
+    // Called by the reader once the requested bytes of bin_id are in its internal buffer.
+    void notify_bin_filled(uint32 bin_id)
+    {
+        std::lock_guard<std::mutex> lck(mtx);
+        std::get<2>(desc[bin_id]) = false;
+        cv_pop.notify_all();
+    }
+
+    // Blocks until a read request is available and returns it through the out
+    // parameters. Returns false (out parameters untouched) when force_to_finish() was
+    // called, or when the queue is empty and no bin has anything left to read.
+    bool get_next_to_read(uint32& bin_id, uint64& file_pos, uchar* &buf, uint32& size)
+    {
+        std::unique_lock<std::mutex> lck(mtx);
+        cv_get_next_to_read.wait(lck, [this]{return !to_read.empty() || !bins_left_to_read || forced_to_finish; });
+        if (forced_to_finish || (to_read.empty() && !bins_left_to_read))
+            return false;
+
+        std::tie(bin_id, file_pos, buf, size) = to_read.front();
+        to_read.pop();
+        return true;
+    }
+
+    // Makes get_next_to_read() return false from now on and wakes any thread blocked in it.
+    void force_to_finish()
+    {
+        std::lock_guard<std::mutex> lck(mtx);
+        forced_to_finish = true;
+        cv_get_next_to_read.notify_all();
+    }
+
+    ~CBinBufProvider()
+    {
+        delete[] buf_bins;
+        delete[] buf_internal;
+    }
+};
+
+// Callable run on its own thread: repeatedly takes a read request from the
+// CBinBufProvider, reads that byte range of the suffix file into the supplied buffer
+// and reports the bin as filled. It returns when the provider has no more requests.
+// A short read is fatal (message + exit(1)).
+template<unsigned SIZE>
+class CSufBinReader
+{
+    CBinBufProvider<SIZE>& bin_provider;
+    FILE* suf_file;
+public:
+    CSufBinReader(CBinBufProvider<SIZE>& bin_provider, FILE* suf_file) :
+        bin_provider(bin_provider),
+        suf_file(suf_file)
+    {
+    }
+
+    void operator()()
+    {
+#ifdef ENABLE_LOGGER
+        CTimer timer;
+#endif
+        // Fields of the current request, filled in by get_next_to_read().
+        uint32 target_bin;
+        uint64 offset;
+        uchar* dest;
+        uint32 n_bytes;
+
+        for (;;)
+        {
+            if (!bin_provider.get_next_to_read(target_bin, offset, dest, n_bytes))
+                return;
+
+            my_fseek(suf_file, offset, SEEK_SET);
+#ifdef ENABLE_LOGGER
+            timer.start();
+#endif
+            const auto n_read = fread(dest, 1, n_bytes, suf_file);
+            if (n_read != n_bytes)
+            {
+                std::cout << "Error while reading sufix file\n";
+                exit(1);
+            }
+#ifdef ENABLE_LOGGER
+            CLoger::GetLogger().log_operation("fread", this, timer.get_time());
+            timer.start();
+#endif
+            bin_provider.notify_bin_filled(target_bin);
+        }
+    }
+
+};
+
+
+template<unsigned SIZE> class CBin // Cursor over the records of one bin of a KMC2 database; NextKmer() yields them one at a time.
+{
+public:
+ CBin(uint32 bin_id, uint64* LUT, CKMC2DbReaderSorted<SIZE>& kmc2_db); // LUT: this bin's slice of the reader's LUTS array
+ bool NextKmer(CKmer<SIZE>& kmer, uint32& counter); // returns false once the bin is exhausted
+
+ uint64 get_kmer_number() // index of the next record to be returned (initialised from LUT[0])
+ {
+ return kmer_number;
+ }
+
+ uint64 get_kmer_number_end() // one past the bin's last record index (initialised from LUT[lut_size])
+ {
+ return kmer_number_end;
+ }
+
+ uint32 get_record_size() // suffix_bytes + counter_size
+ {
+ return record_size;
+ }
+
+ void set_bin_buff(CBinBuff&& _bin_buff) // installs a new buffer descriptor for this bin
+ {
+ bin_buff = std::move(_bin_buff);
+ pos = bin_buff.size; // pos == size makes the next NextKmer() call reload_suf_buf() before reading
+ }
+
+
+#ifdef WIN32
+ // VS2013 does not generate a default move constructor here, so it is written out member by member
+ CBin(CBin&& o) throw():
+ bin_id(o.bin_id),
+ bin_buff(std::move(o.bin_buff)),
+ LUT(o.LUT),
+ pos(o.pos),
+ bin_provider(o.bin_provider),
+ kmc2_db(o.kmc2_db),
+
+ kmer_number(o.kmer_number), kmer_number_end(o.kmer_number_end),
+ kmer_bytes(o.kmer_bytes), prefix_bytes(o.prefix_bytes), suffix_bytes(o.suffix_bytes), counter_size(o.counter_size), record_size(o.record_size),
+ prefix(o.prefix),
+ max_prefix(o.max_prefix)
+ {
+
+ }
+#else
+// g++ generates the move constructor automatically here
+#endif
+
+
+
+
+private:
+ uint32 bin_id; // passed to bin_provider.pop() to identify this bin
+ CBinBuff bin_buff; // buffer currently being consumed
+ uint64* LUT; // LUT[p] .. LUT[p + 1] is the record index range of prefix p (see the skip loop in NextKmer)
+ uint32 pos = 0; // byte offset of the next record inside bin_buff
+ CBinBufProvider<SIZE>& bin_provider;
+ CKMC2DbReaderSorted<SIZE>& kmc2_db; // used for desc.cutoff_min / desc.cutoff_max in NextKmer
+ void reload_suf_buf(); // swaps bin_buff for the next filled buffer and resets pos
+ uint64 kmer_number, kmer_number_end;
+ uint32 kmer_bytes, prefix_bytes, suffix_bytes, counter_size, record_size; // kmer_bytes = prefix_bytes + suffix_bytes; record_size = suffix_bytes + counter_size
+ uint64 prefix = 0; // prefix value of the record at kmer_number; only ever incremented
+ uint64 max_prefix; // lut_size - 1; not read by any method visible in this header section
+};
+
+
+
+
+
+
+
+// Sizes and allocates the two backing arrays, gives every bin its slice of each, and
+// queues the first read request of every non-empty bin.
+//
+// Per bin the slice is min(bin bytes, max_bin_bytes), where max_bin_bytes is
+// SINGLE_BIN_BUFF_SIZE_FOR_DB2_READER rounded down to a whole number of records.
+// bins must not be empty (bins.front() is read unconditionally).
+template<unsigned SIZE> void CBinBufProvider<SIZE>::init(std::vector<CBin<SIZE>>& bins)
+{
+    rec_size = bins.front().get_record_size();
+    max_bin_bytes = SINGLE_BIN_BUFF_SIZE_FOR_DB2_READER / rec_size * rec_size;
+
+    // Phase 1: record each bin's k-mer range and the size of its slice.
+    uint64 total_bytes = 0;
+    internal_bufs.resize(bins.size());
+    for (uint32 i = 0; i < bins.size(); ++i)
+    {
+        const uint64 first = bins[i].get_kmer_number();
+        const uint64 last = bins[i].get_kmer_number_end();
+        const uint32 slice_bytes = (uint32)MIN((last - first) * rec_size, max_bin_bytes);
+
+        internal_bufs[i] = CBinBuff(nullptr, slice_bytes);
+        desc.push_back(std::make_tuple(first, last, true));
+        total_bytes += slice_bytes;
+    }
+
+    bins_left_to_read = (uint32)bins.size();
+    buf_bins = new uchar[total_bytes];
+    buf_internal = new uchar[total_bytes];
+
+    // Phase 2: carve both arrays with the same running offset. Slice i of buf_internal
+    // stays here to be filled by the reader; slice i of buf_bins goes to the bin.
+    uint64 offset = 0;
+    for (uint32 i = 0; i < internal_bufs.size(); ++i)
+    {
+        internal_bufs[i].buf = buf_internal + offset;
+        bins[i].set_bin_buff(CBinBuff(buf_bins + offset, internal_bufs[i].size));
+        offset += internal_bufs[i].size;
+    }
+
+    // Phase 3: queue the first read of every bin that has k-mers; empty bins are
+    // counted as finished straight away.
+    for (uint32 bin_id = 0; bin_id < desc.size(); ++bin_id)
+    {
+        const uint64 kmers_left = std::get<1>(desc[bin_id]) - std::get<0>(desc[bin_id]);
+        if (!kmers_left)
+        {
+            --bins_left_to_read;
+            continue;
+        }
+
+        const uint32 kmers_to_read = (uint32)MIN(kmers_left, max_bin_bytes / rec_size);
+        internal_bufs[bin_id].size = kmers_to_read * rec_size;
+        to_read.push(std::make_tuple(bin_id, 4 + std::get<0>(desc[bin_id]) * rec_size, internal_bufs[bin_id].buf, internal_bufs[bin_id].size));
+        std::get<0>(desc[bin_id]) += kmers_to_read;
+    }
+}
+
+//************************************************************************************************************
+// CKmerPQ - Priority Queue of k-mers - binary heap. K-mers from bins are processed by this priority queue
+//************************************************************************************************************
+template<unsigned SIZE> class CKmerPQ // Binary min-heap holding the current k-mer of every bin that still has k-mers.
+{
+public:
+ CKmerPQ(uint32 _no_of_bins); // reserves _no_of_bins + 1 slots in both arrays
+ inline void init_add(CBin<SIZE>* bin); // inserts the bin's first k-mer; does nothing if the bin yields none
+ inline bool get_min(CBundleData<SIZE>& bundle_data); // appends the smallest k-mer and its counter to bundle_data; false when the heap is empty
+
+ inline void reset(); // empties the heap (pos = 1, desc_pos = 0) without releasing storage
+
+private:
+ inline void update_heap(); // replaces the root with the next k-mer of the same bin, or removes it, then sifts down
+
+ using elem_t = std::pair<CKmer<SIZE>, uint32>;//kmer, desc_id
+ using desc_t = std::pair<CBin<SIZE>*, uint32>;//bin, counter
+ std::vector<elem_t> elems; // heap array, 1-based: elems[1] is the minimum, elems[0] is unused
+ std::vector<desc_t> descs; // indexed by desc_id; holds the source bin and the counter of its k-mer currently in the heap
+ uint32 pos, desc_pos; // pos: next free heap slot (heap is empty when pos == 1); desc_pos: next free descs slot
+};
+
+
+//************************************************************************************************************
+// CMergerParent - Merger of k-mers produced by CMergerChilds
+//************************************************************************************************************
+// Callable run on its own thread: merges the sorted bundle streams arriving on
+// input_queues into output_queue.
+template<unsigned SIZE> class CMergerParent
+{
+public:
+    // The queue pointers in input_queues are copied; the queues themselves stay owned
+    // by the caller and must outlive this object.
+    CMergerParent(std::vector<CCircularQueue<SIZE>*>& input_queues, CCircularQueue<SIZE>& output_queue);
+
+    void operator()();
+
+private:
+    std::vector<CBundleData<SIZE>> input_bundles;
+    // Private copy of the pointer list, deliberately NOT a reference: operator() erases
+    // each queue from this list once it is drained. Doing that on the owner's vector
+    // would make the owner skip those queues when it later deletes them (leak), and
+    // would race with the owner iterating the vector from another thread.
+    std::vector<CCircularQueue<SIZE>*> input_queues;
+    CBundleData<SIZE> output_bundle;
+    CCircularQueue<SIZE>& output_queue;
+};
+
+//************************************************************************************************************
+// CMergerChild - Merger of k-mers from bins
+//************************************************************************************************************
+template<unsigned SIZE> class CMergerChild // Callable run on its own thread: merges the k-mers of a range of bins into output_queue.
+{
+ using bin_iter = typename std::vector<CBin<SIZE>>::iterator;
+public:
+ CMergerChild(bin_iter begin, bin_iter end, CCircularQueue<SIZE>& output_queue); // [begin, end): the bins this child merges
+
+ void operator()(); // runs the merge and marks output_queue completed at the end
+
+private:
+ std::vector<std::reference_wrapper<CBin<SIZE>>> bins; // references only; the CBin objects live in the vector the iterators came from
+ CCircularQueue<SIZE>& output_queue;
+};
+
+//************************************************************************************************************
+// CKMC2DbReaderSorted - Produce k-mers in sorted order from KMC2 database
+//************************************************************************************************************
+template<unsigned SIZE> class CKMC2DbReaderSorted // Reads a KMC2 database with a pipeline of threads: suffix reader -> child mergers -> parent merger -> output_queue.
+{
+public:
+ CKMC2DbReaderSorted(const CKMC_header& header, const CInputDesc& desc); // loads the LUTs, opens the suffix file and starts all threads
+
+ void NextBundle(CBundle<SIZE>& bundle, bool& finished); // pops the next bundle from output_queue; joins and frees the pipeline when none is left
+
+ void IgnoreRest(); // abandons the remaining data: force-finishes the queues, then joins and frees the pipeline
+
+ ~CKMC2DbReaderSorted(); // releases LUTS and closes kmc_suf only; threads are joined in NextBundle / IgnoreRest
+
+private:
+ //void get_suf_buf_part(uchar* &buf, uint64 start, uint32 size);
+
+ const CKMC_header& header;
+ const CInputDesc& desc;
+ uint64* LUTS = nullptr; // lut_size * no_of_bins + 1 entries read from the .kmc_pre file
+ uint32 lut_size = 0; // 4^lut_prefix_len entries per bin
+ uint32 suffix_bytes; // (kmer_len - lut_prefix_len) / 4
+ uint32 record_size; // suffix_bytes + counter_size
+ FILE* kmc_suf; // .kmc_suf file, opened unbuffered in the constructor
+
+ friend class CBin<SIZE>;
+ std::vector<CBin<SIZE>> bins; // one per database bin; reserved up front in the constructor
+ CBinBufProvider<SIZE> bin_provider;
+
+ CSufBinReader<SIZE>* suf_bin_reader; // heap-allocated callable run by suf_bin_reader_th
+ std::thread suf_bin_reader_th;
+
+
+ uint32 n_child_threads; // taken from desc.threads
+
+ CMergerParent<SIZE>* parent; // heap-allocated callable run by parent_thread
+ std::thread parent_thread;
+
+ CCircularQueue<SIZE> output_queue; // parent merger -> NextBundle
+ std::vector<CCircularQueue<SIZE>*> childs_parent_queues; // one queue per child merger, child -> parent
+
+ std::vector<CMergerChild<SIZE>*> childs; // heap-allocated callables run by childs_threads
+ std::vector<std::thread> childs_threads;
+
+ //mutable std::mutex mtx;
+};
+
+//************************************************************************************************************
+// CKCM2DbReaderSeqCounter_Base - Base class for classes to access k-mers one by one (not sorted) or
+// for counters only from KMC2 database
+//************************************************************************************************************
+template <unsigned SIZE> class CKCM2DbReaderSeqCounter_Base // Shared state for the sequential and counters-only readers; usable only as a base (protected ctor/dtor).
+{
+protected:
+ CKCM2DbReaderSeqCounter_Base(const CKMC_header& header, const CInputDesc& desc);
+ ~CKCM2DbReaderSeqCounter_Base();
+
+ void open_files();
+ bool reload_suf_buff(); // defined outside this section; presumably refills sufix_buff -- TODO confirm return value meaning
+
+ static const uint32 PREFIX_BUFF_BYTES = KMC2_DB_READER_PREFIX_BUFF_BYTES;
+ static const uint32 SUFIX_BUFF_BYTES = KMC2_DB_READER_SUFIX_BUFF_BYTES;
+
+ const CKMC_header& header;
+ const CInputDesc& desc;
+
+ uint32 sufix_bytes; // (kmer_len - lut_prefix_len) / 4
+ uint32 record_size; // bytes per suffix-file record: sufix_bytes + counter_size
+ uint64 sufix_buff_size, sufix_buff_pos, sufix_left_to_read; // sufix_buff_size starts as SUFIX_BUFF_BYTES rounded down to whole records; sufix_left_to_read starts as total_kmers * record_size
+ uint64 prefix_buff_size, prefix_buff_pos, prefix_left_to_read;
+ uint64 sufix_number;
+
+ uint32 kmer_bytes, prefix_bytes;
+
+ uchar* sufix_buff = nullptr;
+
+ FILE* sufix_file;
+ std::string sufix_file_name;
+};
+
+//************************************************************************************************************
+// CKMC2DbReaderSequential - Produce k-mers sequentially from KMC2 database (they are not sorted!)
+//************************************************************************************************************
+template<unsigned SIZE> class CKMC2DbReaderSequential : public CKCM2DbReaderSeqCounter_Base<SIZE> // Yields k-mers with counters one by one, in file order.
+{
+public:
+ CKMC2DbReaderSequential(const CKMC_header& header, const CInputDesc& desc);
+ bool NextKmerSequential(CKmer<SIZE>& kmer, uint32& counter); // defined outside this section; presumably returns false at end of data -- TODO confirm
+ ~CKMC2DbReaderSequential();
+
+private:
+ void allocate_buffers();
+ void reload_pref_buff();
+
+ uint32 signle_bin_size, map_size, map_size_bytes, no_of_bins; // "signle" is the identifier's existing spelling; kept because code outside this section may use it
+ std::string prefix_file_name;
+ FILE* prefix_file;
+ uint64 current_prefix_index;
+ uint64 prefix_mask;
+
+ uint64* prefix_buff = nullptr;
+};
+
+//************************************************************************************************************
+// CKMC2DbReaderCountersOnly - Produce counters of k-mers from KMC2 database
+//************************************************************************************************************
+template<unsigned SIZE> class CKMC2DbReaderCountersOnly : CKCM2DbReaderSeqCounter_Base<SIZE> // Yields only the counters of the stored k-mers; the base is inherited privately (class default).
+{
+public:
+ CKMC2DbReaderCountersOnly(const CKMC_header& header, const CInputDesc& desc);
+ bool NextCounter(uint32& counter); // defined outside this section; presumably returns false at end of data -- TODO confirm
+
+private:
+ void allocate_buffers();
+};
+
+//************************************************************************************************************
+// CKMC2DbReader - reader of KMC2
+//************************************************************************************************************
+template<unsigned SIZE> class CKMC2DbReader : public CInput<SIZE> // CInput front end for a KMC2 database; holds one optional reader per access style.
+{
+public:
+ CKMC2DbReader(const CKMC_header& header, const CInputDesc& desc, CPercentProgress& percent_progress, KMCDBOpenMode open_mode); // presumably open_mode selects which reader below is created -- TODO confirm in the definition
+
+ void NextBundle(CBundle<SIZE>& bundle) override;
+
+ void IgnoreRest() override;
+
+ bool NextKmerSequential(CKmer<SIZE>& kmer, uint32& counter);
+ bool NextCounter(uint32& counter);
+
+private:
+ CPercentProgress& percent_progress;
+ uint32 progress_id;
+
+ std::unique_ptr<CKMC2DbReaderSorted<SIZE>> db_reader_sorted; // sorted bundle access
+ std::unique_ptr<CKMC2DbReaderSequential<SIZE>> db_reader_sequential; // k-mer by k-mer access in file order
+ std::unique_ptr<CKMC2DbReaderCountersOnly<SIZE>> db_reader_counters_only; // counters only
+};
+
+
+
+/*****************************************************************************************************************************/
+/**************************************************** CBin IMPLEMENTATION ****************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+
+// Binds the bin to its LUT slice and to the owning reader, and derives the record
+// geometry:
+//   kmer_number     = LUT[0]                 (first record index of the bin)
+//   kmer_number_end = LUT[lut_size]          (entry just past this bin's slice)
+//   prefix_bytes    = ceil(lut_prefix_len / 4)
+//   kmer_bytes      = prefix_bytes + suffix_bytes
+//   record_size     = suffix_bytes + counter_size
+// Initialisers are listed in member declaration order.
+template<unsigned SIZE> CBin<SIZE>::CBin(uint32 bin_id, uint64* LUT, CKMC2DbReaderSorted<SIZE>& kmc2_db) :
+    bin_id(bin_id),
+    LUT(LUT),
+    bin_provider(kmc2_db.bin_provider),
+    kmc2_db(kmc2_db),
+    kmer_number(LUT[0]),
+    kmer_number_end(LUT[kmc2_db.lut_size]),
+    prefix_bytes((kmc2_db.header.lut_prefix_len + 3) / 4),
+    suffix_bytes(kmc2_db.suffix_bytes),
+    counter_size(kmc2_db.header.counter_size),
+    max_prefix(kmc2_db.lut_size - 1)
+{
+    kmer_bytes = prefix_bytes + suffix_bytes;
+    record_size = suffix_bytes + counter_size;
+}
+
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+
+
+/*****************************************************************************************************************************/
+template<unsigned SIZE> bool CBin<SIZE>::NextKmer(CKmer<SIZE>& kmer, uint32& counter) // Outputs the next record whose counter lies in [cutoff_min, cutoff_max]; returns false when the bin is exhausted.
+{
+ while (true) // each pass consumes one record; records outside the cutoff range are skipped
+ {
+ if (kmer_number >= kmer_number_end) // all records of this bin have been consumed
+ return false;
+
+ if (pos >= bin_buff.size) // current buffer used up: swap in the next filled one
+ reload_suf_buf();
+
+ //skip empty
+ while (LUT[prefix + 1] <= kmer_number) // advance until kmer_number < LUT[prefix + 1], i.e. prefix is the one this record belongs to
+ {
+ ++prefix;
+ }
+
+ uint32 in_kmer_pos = kmer_bytes - 1; // the prefix occupies the highest byte indices of the k-mer
+ uchar* record = bin_buff.buf + pos;
+ kmer.load(record, suffix_bytes); // NOTE(review): the counter is read through record[] below, so load() presumably advances record past the suffix bytes -- confirm in kmer.h
+ for (int32 i = prefix_bytes - 1; i >= 0; --i) // most significant prefix byte goes to the highest byte index
+ kmer.set_byte(in_kmer_pos--, uchar(prefix >> (i << 3)));
+
+ counter = 0;
+ for (int32 i = counter_size - 1; i >= 0; --i) // counter bytes are little-endian: record[0] is the least significant byte
+ {
+ counter <<= 8;
+ counter += record[i];
+ }
+
+ ++kmer_number;
+ pos += record_size;
+
+ if (counter >= kmc2_db.desc.cutoff_min && counter <= kmc2_db.desc.cutoff_max) // otherwise loop on to the next record
+ return true;
+ }
+ return true; // not reachable: the loop above only exits through return
+}
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Exchanges the consumed buffer for this bin's next filled one (blocking inside
+// bin_provider.pop() until it is available) and restarts reading at its first record.
+template<unsigned SIZE> void CBin<SIZE>::reload_suf_buf()
+{
+    this->bin_provider.pop(this->bin_id, this->bin_buff);
+    this->pos = 0;
+}
+
+
+
+/*****************************************************************************************************************************/
+/************************************************** CKmerPQ IMPLEMENTATION ***************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+
+// Creates an empty heap with room for one entry per bin. Both arrays get
+// _no_of_bins + 1 default-constructed slots; the heap is 1-based, so pos starts at 1.
+template<unsigned SIZE> CKmerPQ<SIZE>::CKmerPQ(uint32 _no_of_bins) :
+    elems(_no_of_bins + 1),
+    descs(_no_of_bins + 1),
+    pos(1),
+    desc_pos(0)
+{
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Returns the queue to its freshly constructed, empty state. Storage is kept.
+template<unsigned SIZE> void CKmerPQ<SIZE>::reset()
+{
+    desc_pos = 0; // no descriptor in use
+    pos = 1;      // first free heap slot == root, i.e. heap is empty
+}
+
+/*****************************************************************************************************************************/
+// Appends the smallest k-mer in the heap, with its counter, to bundle_data and
+// advances the heap past it. Returns false, leaving bundle_data untouched, when the
+// heap is empty.
+template<unsigned SIZE> inline bool CKmerPQ<SIZE>::get_min(CBundleData<SIZE>& bundle_data)
+{
+    if (pos > 1)
+    {
+        elem_t& top = elems[1];
+        bundle_data.Insert(top.first, descs[top.second].second);
+
+        update_heap();
+        return true;
+    }
+    return false;
+}
+
+/*****************************************************************************************************************************/
+// Registers a bin with the queue: fetches its first k-mer and inserts it into the
+// heap. A bin that yields no k-mer is not registered at all.
+template<unsigned SIZE> inline void CKmerPQ<SIZE>::init_add(CBin<SIZE>* bin)
+{
+    CKmer<SIZE> kmer;
+    uint32 counter;
+    if (!bin->NextKmer(kmer, counter))
+        return;
+
+    descs[desc_pos] = std::make_pair(bin, counter);
+    elems[pos] = std::make_pair(kmer, desc_pos);
+
+    // Sift the new leaf up while it is smaller than its parent (parent of i is i / 2).
+    for (uint32 node = pos++; node > 1 && elems[node].first < elems[node / 2].first; node /= 2)
+        swap(elems[node], elems[node / 2]);
+
+    ++desc_pos;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+template<unsigned SIZE> inline void CKmerPQ<SIZE>::update_heap() // Replaces the root with the next k-mer of the same bin (or with the last leaf if that bin is exhausted) and restores heap order.
+{
+ uint32 desc_id = elems[1].second; // descriptor of the bin whose k-mer is at the root
+ CBin<SIZE>* bin = descs[desc_id].first;
+ CKmer<SIZE> kmer; // the value that has to be placed into the heap
+ uint32 counter;
+ if (!bin->NextKmer(kmer, counter))
+ {
+ kmer.set(elems[--pos].first); // bin exhausted: shrink the heap and re-insert its last leaf instead
+ desc_id = elems[pos].second;
+ }
+ else
+ descs[desc_id].second = counter; // same bin stays in the heap; remember the counter of its new k-mer
+
+ uint32 parent, less;
+ parent = less = 1; // parent is a "hole" moving down from the root; less is its smaller child
+ while (true)
+ {
+ if (parent * 2 >= pos) // no children: the hole is a leaf
+ break;
+ if (parent * 2 + 1 >= pos) // only a left child exists
+ less = parent * 2;
+ else if (elems[parent * 2].first < elems[parent * 2 + 1].first)
+ less = parent * 2;
+ else
+ less = parent * 2 + 1;
+ if (elems[less].first < kmer)
+ {
+ elems[parent] = elems[less]; // pull the smaller child up and continue below it
+ parent = less;
+ }
+ else
+ break; // kmer is not greater than either child: it belongs in the hole
+ }
+ elems[parent] = std::make_pair(kmer, desc_id);
+}
+
+
+
+/*****************************************************************************************************************************/
+/*********************************************** CMergerParent IMPLEMENTATION ************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Stores the input queue list and the output queue, and creates one (default
+// constructed) input bundle per input queue.
+template<unsigned SIZE> CMergerParent<SIZE>::CMergerParent(std::vector<CCircularQueue<SIZE>*>& input_queues, CCircularQueue<SIZE>& output_queue) :
+    input_bundles(input_queues.size()),
+    input_queues(input_queues),
+    output_queue(output_queue)
+{
+}
+
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Merge loop. input_bundles[i] always holds the bundle most recently popped from
+// input_queues[i]. An input whose queue has nothing more to deliver is erased from
+// both lists, so the two stay index-aligned. The loop ends when no input is left or
+// when output_queue refuses a bundle. Whatever is left in output_bundle is pushed and
+// the output queue is marked completed in every case.
+template<unsigned SIZE> void CMergerParent<SIZE>::operator()()
+{
+    // Prime every input with its first bundle.
+    for (uint32 i = 0; i < input_queues.size();)
+    {
+        if (input_queues[i]->pop(input_bundles[i]))
+        {
+            ++i;
+            continue;
+        }
+        input_queues.erase(input_queues.begin() + i);
+        input_bundles.erase(input_bundles.begin() + i);
+    }
+
+    while (!input_bundles.empty())
+    {
+        // Linear scan for the input with the smallest front k-mer (first one wins ties).
+        uint32 smallest = 0;
+        for (uint32 i = 1; i < input_bundles.size(); ++i)
+            if (input_bundles[i].TopKmer() < input_bundles[smallest].TopKmer())
+                smallest = i;
+
+        CBundleData<SIZE>& source = input_bundles[smallest];
+        output_bundle.Insert(source.TopKmer(), source.TopCounter());
+        source.Pop();
+
+        // Refill a drained bundle; drop the input if its queue has nothing more.
+        if (source.Empty() && !input_queues[smallest]->pop(source))
+        {
+            input_queues.erase(input_queues.begin() + smallest);
+            input_bundles.erase(input_bundles.begin() + smallest);
+        }
+
+        if (output_bundle.Full() && !output_queue.push(output_bundle))
+            break;
+    }
+
+    if (!output_bundle.Empty())
+        output_queue.push(output_bundle);
+    output_queue.mark_completed();
+}
+
+
+/*****************************************************************************************************************************/
+/************************************************ CMergerChild IMPLEMENTATION ************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Remembers references to the bins in [begin, end) and the queue the merged k-mers
+// are sent to. The bins themselves are not copied.
+template<unsigned SIZE> CMergerChild<SIZE>::CMergerChild(bin_iter begin, bin_iter end, CCircularQueue<SIZE>& output_queue) :
+    output_queue(output_queue)
+{
+    bins.assign(begin, end);
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Merges the k-mers of this child's bins through a CKmerPQ and sends them to
+// output_queue in bundles. Stops early if the queue refuses a bundle. Whatever is
+// left in the bundle is pushed and the queue is marked completed in every case.
+template<unsigned SIZE> void CMergerChild<SIZE>::operator()()
+{
+    CKmerPQ<SIZE> kmers_pq(static_cast<uint32>(bins.size()));
+    for (auto& bin_ref : bins)
+        kmers_pq.init_add(&bin_ref.get());
+
+    CBundleData<SIZE> bundle_data;
+    while (kmers_pq.get_min(bundle_data))
+    {
+        if (bundle_data.Full() && !output_queue.push(bundle_data))
+            break;
+    }
+
+    if (!bundle_data.Empty())
+        output_queue.push(bundle_data);
+    output_queue.mark_completed();
+}
+
+
+
+/*****************************************************************************************************************************/
+/********************************************* CKMC2DbReaderSorted IMPLEMENTATION ********************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Opens the database and starts the whole reading pipeline:
+//   1. reads lut_size * no_of_bins + 1 LUT entries from "<file_src>.kmc_pre" (offset 4),
+//   2. opens "<file_src>.kmc_suf" unbuffered,
+//   3. creates one CBin per database bin and initialises the buffer provider,
+//   4. starts the suffix reader thread, desc.threads child mergers (at least one) and
+//      the parent merger.
+// Any I/O failure prints a message and terminates the process with exit(1).
+template<unsigned SIZE> CKMC2DbReaderSorted<SIZE>::CKMC2DbReaderSorted(const CKMC_header& header, const CInputDesc& desc) :
+    header(header),
+    desc(desc),
+    output_queue(DEFAULT_CIRCULAL_QUEUE_CAPACITY)
+{
+    lut_size = 1 << 2 * header.lut_prefix_len;
+    // Computed in 64 bits: lut_size * no_of_bins can exceed 32 bits for long prefixes.
+    const uint64 lut_recs = (uint64)lut_size * header.no_of_bins + 1;
+    // operator new[] reports failure by throwing std::bad_alloc, so no null check follows.
+    LUTS = new uint64[lut_recs];
+    suffix_bytes = (header.kmer_len - header.lut_prefix_len) / 4;
+    record_size = suffix_bytes + header.counter_size;
+
+    std::string kmc_pre_file_name = desc.file_src + ".kmc_pre";
+    FILE* kmc_pre = fopen(kmc_pre_file_name.c_str(), "rb");
+    if (!kmc_pre)
+    {
+        std::cout << "Cannot open kmc2 prefix file to read LUTS";
+        exit(1);
+    }
+
+    my_fseek(kmc_pre, 4, SEEK_SET);
+    if (fread(LUTS, sizeof(uint64), lut_recs, kmc_pre) != lut_recs)
+    {
+        std::cout << "Some error occured while reading LUTS from kmc2 prefix file \n";
+        exit(1);
+    }
+    fclose(kmc_pre);
+
+    std::string kmc_suf_file_name = desc.file_src + ".kmc_suf";
+    kmc_suf = fopen(kmc_suf_file_name.c_str(), "rb");
+
+    if (!kmc_suf)
+    {
+        std::cout << "Cannot open kmc2 suffix file\n";
+        exit(1);
+    }
+    setvbuf(kmc_suf, NULL, _IONBF, 0);
+
+    // reserve() first: CBin objects are referenced by the mergers, so the vector must
+    // not reallocate after they have been created.
+    bins.reserve(header.no_of_bins);
+    for (uint32 i = 0; i < header.no_of_bins; ++i)
+        bins.emplace_back(i, LUTS + (uint64)i * lut_size, *this); // 64-bit offset, same reason as lut_recs
+
+    //starting threads
+
+    bin_provider.init(bins);
+
+    suf_bin_reader = new CSufBinReader<SIZE>(bin_provider, kmc_suf);
+    suf_bin_reader_th = std::thread(std::ref(*suf_bin_reader));
+
+    // At least one child is required: 0 would divide by zero below and wrap
+    // n_child_threads - 1 around.
+    n_child_threads = desc.threads;
+    if (n_child_threads == 0)
+        n_child_threads = 1;
+
+    childs_parent_queues.reserve(n_child_threads);
+
+    for (uint32 i = 0; i < n_child_threads; ++i)
+        childs_parent_queues.push_back(new CCircularQueue<SIZE>(DEFAULT_CIRCULAL_QUEUE_CAPACITY));
+
+    uint32 bins_per_thread = header.no_of_bins / n_child_threads;
+
+    for (uint32 i = 0; i < n_child_threads - 1; ++i)
+    {
+        childs.push_back(new CMergerChild<SIZE>(bins.begin() + i * bins_per_thread, bins.begin() + (i + 1) * bins_per_thread, *childs_parent_queues[i]));
+        childs_threads.push_back(std::thread(std::ref(*childs.back())));
+    }
+
+    //last one: also takes the bins left over by the integer division
+    childs.push_back(new CMergerChild<SIZE>(bins.begin() + (n_child_threads - 1) * bins_per_thread, bins.end(), *childs_parent_queues.back()));
+    childs_threads.push_back(std::thread(std::ref(*childs.back())));
+
+    parent = new CMergerParent<SIZE>(childs_parent_queues, output_queue);
+    parent_thread = std::thread(std::ref(*parent));
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Delivers the next bundle of sorted k-mers.
+//
+// bundle   - receives the data popped from output_queue.
+// finished - set to true once output_queue has nothing more to deliver; left
+//            untouched while bundles are still being returned.
+//
+// On exhaustion the whole pipeline is joined and freed. The owning containers are
+// emptied and the pointers reset afterwards, so an additional call just reports
+// finished again instead of joining or deleting anything twice.
+template<unsigned SIZE> void CKMC2DbReaderSorted<SIZE>::NextBundle(CBundle<SIZE>& bundle, bool& finished)
+{
+    if (output_queue.pop(bundle.Data()))
+    {
+        return;
+    }
+
+    for (auto& child_thread : childs_threads)
+        if (child_thread.joinable())
+            child_thread.join();
+    childs_threads.clear();
+    for (auto& child : childs)
+        delete child;
+    childs.clear();
+
+    if (parent_thread.joinable())
+        parent_thread.join();
+    delete parent;
+    parent = nullptr;
+
+    for (auto& q : childs_parent_queues)
+        delete q;
+    childs_parent_queues.clear();
+
+    if (suf_bin_reader_th.joinable())
+        suf_bin_reader_th.join();
+    delete suf_bin_reader;
+    suf_bin_reader = nullptr;
+
+    // Tell the caller that the database has been read completely.
+    finished = true;
+}
+
+/*****************************************************************************************************************************/
+template<unsigned SIZE> void CKMC2DbReaderSorted<SIZE>::IgnoreRest() // Abandons the remaining data: shuts the pipeline down without consuming it, then joins and frees every stage.
+{
+ output_queue.force_finish(); // presumably makes a pending or later push by the parent merger fail so that it leaves its loop -- confirm in queues.h
+
+ for (auto& q : childs_parent_queues)
+ q->force_finish(); // same for the child -> parent queues
+
+ for (auto& child_thread : childs_threads)
+ child_thread.join();
+ for (auto& child : childs)
+ delete child;
+
+ parent_thread.join();
+ delete parent;
+
+ for (auto& q : childs_parent_queues) // queues are freed only after both their producers and their consumer have been joined
+ delete q;
+
+ bin_provider.force_to_finish(); // wakes the suffix reader and makes its get_next_to_read() return false
+
+ suf_bin_reader_th.join();
+ delete suf_bin_reader;
+}
+
+/*****************************************************************************************************************************/
+// Closes the suffix file and frees the LUT array. Threads and the heap-allocated
+// pipeline stages are not touched here; NextBundle() (on exhaustion) and
+// IgnoreRest() are the functions that join and free them.
+template<unsigned SIZE> CKMC2DbReaderSorted<SIZE>::~CKMC2DbReaderSorted()
+{
+    fclose(kmc_suf);
+    delete[] LUTS;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+//template<unsigned SIZE> void CKMC2DbReaderSorted<SIZE>::get_suf_buf_part(uchar* &buf, uint64 start, uint32 size)
+//{
+//#ifdef ENABLE_LOGGER
+// CTimer timer;
+// timer.start();
+//#endif
+// std::unique_lock<std::mutex> lck(mtx);
+//#ifdef ENABLE_LOGGER
+// CLoger::GetLogger().log_operation("waiting for lock", this, timer.get_time());
+// timer.start();
+//#endif
+// start = 4 + start * record_size;
+// size *= record_size;
+//
+//
+// my_fseek(kmc_suf, start, SEEK_SET);
+// if (fread(buf, 1, size, kmc_suf) != size)
+// {
+// std::cout << "Error: some error occured while reading " << desc.file_src << ".kmc_suf file\n";
+// exit(1);
+// }
+//#ifdef ENABLE_LOGGER
+// CLoger::GetLogger().log_operation("fread time", this, timer.get_time());
+//#endif
+//}
+
+
+/*****************************************************************************************************************************/
+/******************************************* CKCM2DbReaderSeqCounter_Base IMPLEMENTATION *************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Stores the header and input description and derives the record layout
+// (prefix/suffix byte counts, record size) and the suffix buffer size from
+// the header values.
+template<unsigned SIZE> CKCM2DbReaderSeqCounter_Base<SIZE>::CKCM2DbReaderSeqCounter_Base(const CKMC_header& header, const CInputDesc& desc) :
+    header(header),
+    desc(desc)
+{
+    // Byte counts of the two parts of a k-mer.
+    prefix_bytes = (header.lut_prefix_len + 3) / 4;
+    sufix_bytes = (header.kmer_len - header.lut_prefix_len) / 4;
+    kmer_bytes = prefix_bytes + sufix_bytes;
+
+    // One record in the suffix file is the suffix followed by the counter.
+    record_size = sufix_bytes + header.counter_size;
+
+    // Total number of bytes of records to read.
+    sufix_left_to_read = header.total_kmers * record_size;
+
+    // The buffer holds a whole number of records and is never larger than
+    // the data that is left to read.
+    sufix_buff_size = (SUFIX_BUFF_BYTES / record_size) * record_size;
+    if (sufix_buff_size > sufix_left_to_read)
+        sufix_buff_size = sufix_left_to_read;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************* PROTECTED**********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Closes the suffix file (if it is open) and frees the suffix buffer.
+template<unsigned SIZE> CKCM2DbReaderSeqCounter_Base<SIZE>::~CKCM2DbReaderSeqCounter_Base()
+{
+ if (sufix_file)
+ fclose(sufix_file);
+ delete[] sufix_buff;
+}
+
+/*****************************************************************************************************************************/
+// Opens "<file_src>.kmc_suf" unbuffered, verifies the "KMCS" marker at the
+// beginning and at the end of the file and leaves the file position just
+// after the start marker. Any failure prints an error and exits with code 1.
+template<unsigned SIZE> void CKCM2DbReaderSeqCounter_Base<SIZE>::open_files()
+{
+    sufix_file_name = desc.file_src + ".kmc_suf";
+    sufix_file = fopen(sufix_file_name.c_str(), "rb");
+    if (!sufix_file)
+    {
+        std::cout << "Error: cannot open file: " << sufix_file_name << "\n";
+        exit(1);
+    }
+    setvbuf(sufix_file, NULL, _IONBF, 0);
+
+    // Reads 4 bytes at the current position and checks that they are "KMCS";
+    // the two messages are printed for a short read and a wrong marker respectively.
+    auto expect_marker = [this](const char* read_error, const char* marker_error)
+    {
+        char tag[4];
+        if (fread(tag, 1, 4, this->sufix_file) != 4)
+        {
+            std::cout << read_error << this->sufix_file_name << "\n";
+            exit(1);
+        }
+        if (strncmp(tag, "KMCS", 4) != 0)
+        {
+            std::cout << marker_error << this->sufix_file_name << "\n";
+            exit(1);
+        }
+    };
+
+    expect_marker("Error: while reading start marker in file: ", "Error: wrong start marker in file: ");
+
+    my_fseek(sufix_file, -4, SEEK_END);
+    expect_marker("Error: while reading end marker in file: ", "Error: wrong end marker in file: ");
+
+    my_fseek(sufix_file, 4, SEEK_SET); //skip KMCS
+}
+
+/*****************************************************************************************************************************/
+// Refills the suffix buffer with the next chunk of the suffix file.
+// Returns false when there is nothing left to read, true after a successful
+// refill (buffer position reset to 0). A short read prints an error and exits.
+template<unsigned SIZE> bool CKCM2DbReaderSeqCounter_Base<SIZE>::reload_suf_buff()
+{
+    const uint64 chunk = MIN(sufix_left_to_read, sufix_buff_size);
+    if (chunk != 0)
+    {
+        const uint64 got = fread(sufix_buff, 1, chunk, sufix_file);
+        if (got == chunk)
+        {
+            sufix_left_to_read -= chunk;
+            sufix_buff_pos = 0;
+            return true;
+        }
+        std::cout << "Error: some error while reading " << sufix_file_name << "\n";
+        exit(1);
+    }
+    return false;
+}
+
+
+/*****************************************************************************************************************************/
+/********************************************* CKMC2DbReaderSequential IMPLEMENTATION ****************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Opens the suffix and prefix files of a KMC2 database for sequential reading,
+// sizes and allocates the prefix/suffix buffers and loads their first chunks.
+// Any I/O failure prints an error and exits with code 1.
+template<unsigned SIZE> CKMC2DbReaderSequential<SIZE>::CKMC2DbReaderSequential(const CKMC_header& header, const CInputDesc& desc) :
+    CKCM2DbReaderSeqCounter_Base<SIZE>(header, desc)
+{
+    this->open_files();
+
+    prefix_file_name = desc.file_src + ".kmc_pre";
+    prefix_file = fopen(prefix_file_name.c_str(), "rb");
+
+    if (!prefix_file)
+    {
+        std::cout << "Error: cannot open file: " << prefix_file_name << "\n";
+        exit(1);
+    }
+    setvbuf(prefix_file, NULL, _IONBF, 0);
+
+    signle_bin_size = 1 << 2 * header.lut_prefix_len;
+    map_size = (1 << 2 * header.signature_len) + 1;
+
+    map_size_bytes = map_size * sizeof(uint32);
+
+    no_of_bins = header.no_of_bins;
+
+    this->prefix_buff_size = this->PREFIX_BUFF_BYTES / sizeof(uint64);
+
+    this->sufix_left_to_read = this->header.total_kmers * this->record_size;
+
+    if (this->sufix_left_to_read < this->sufix_buff_size)
+        this->sufix_buff_size = this->sufix_left_to_read;
+
+    // Number of prefix entries = entries per bin * number of bins. The product
+    // is computed in 64 bits: in 32-bit arithmetic it can overflow for larger
+    // lut_prefix_len / no_of_bins combinations.
+    this->prefix_left_to_read = (static_cast<uint64>(1) << (this->header.lut_prefix_len * 2)) * this->no_of_bins;
+
+    if (this->prefix_left_to_read < this->prefix_buff_size)
+        this->prefix_buff_size = this->prefix_left_to_read;
+
+    prefix_mask = (1 << 2 * this->header.lut_prefix_len) - 1;
+
+    allocate_buffers();
+
+    // Position the prefix file once, right before the first read.
+    my_fseek(prefix_file, 4 + sizeof(uint64), SEEK_SET);//skip KMCP and first value as it must be 0
+    reload_pref_buff();
+
+    this->reload_suf_buff();
+    current_prefix_index = 0;
+    this->sufix_number = 0;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Reads records in file order until one has a counter within
+// [desc.cutoff_min, desc.cutoff_max]; stores that k-mer and its counter in the
+// output parameters and returns true. Returns false once all
+// header.total_kmers records have been consumed.
+template<unsigned SIZE> bool CKMC2DbReaderSequential<SIZE>::NextKmerSequential(CKmer<SIZE>& kmer, uint32& counter)
+{
+ while (true)
+ {
+ if (this->sufix_number >= this->header.total_kmers)
+ return false;
+
+ // Advance through the prefix table while its current entry does not exceed
+ // the index of the current record, refilling the prefix buffer when it runs out.
+ while (this->prefix_buff[this->prefix_buff_pos] <= this->sufix_number)
+ {
+ ++current_prefix_index;
+ ++this->prefix_buff_pos;
+ if (this->prefix_buff_pos >= this->prefix_buff_size)
+ this->reload_pref_buff();
+ }
+
+ uchar* record = this->sufix_buff + this->sufix_buff_pos;
+ uint32 pos = this->kmer_bytes - 1;
+
+ // Only the bits selected by prefix_mask of the running prefix index are used as the prefix.
+ uint32 current_prefix = static_cast<uint32>(current_prefix_index & prefix_mask);
+
+ // Load the suffix bytes, then store the prefix bytes starting at the highest k-mer byte.
+ // NOTE(review): the counter is read from `record` below, which is only correct if
+ // kmer.load() advances `record` past the suffix bytes - confirm against CKmer::load.
+ kmer.load(record, this->sufix_bytes);
+ for (int32 i = this->prefix_bytes - 1; i >= 0; --i)
+ kmer.set_byte(pos--, current_prefix >> (i << 3));
+
+ // Assemble the counter from counter_size bytes, lowest index = least significant byte.
+ counter = 0;
+ for (int32 i = this->header.counter_size - 1; i >= 0; --i)
+ {
+ counter <<= 8;
+ counter += record[i];
+ }
+
+ ++this->sufix_number;
+ this->sufix_buff_pos += this->record_size;
+
+ if (this->sufix_buff_pos >= this->sufix_buff_size)
+ this->reload_suf_buff();
+
+ // Records with a counter outside the cutoff range are skipped.
+ if (counter >= this->desc.cutoff_min && counter <= this->desc.cutoff_max)
+ return true;
+ }
+}
+
+/*****************************************************************************************************************************/
+// Closes the prefix file (if it is open) and frees the prefix buffer.
+template<unsigned SIZE> CKMC2DbReaderSequential<SIZE>::~CKMC2DbReaderSequential()
+{
+ if (prefix_file)
+ fclose(prefix_file);
+ delete[] prefix_buff;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Allocates the suffix buffer (sufix_buff_size bytes) and the prefix buffer
+// (prefix_buff_size uint64 entries); both sizes must be set before the call.
+template<unsigned SIZE> void CKMC2DbReaderSequential<SIZE>::allocate_buffers()
+{
+ this->sufix_buff = new uchar[this->sufix_buff_size];
+ this->prefix_buff = new uint64[this->prefix_buff_size];
+}
+
+/*****************************************************************************************************************************/
+// Refills the prefix buffer with the next entries of the prefix file and
+// resets the buffer position. When fewer entries than the buffer size were
+// left, a guard entry equal to header.total_kmers is stored after the last one.
+// A short read prints an error and exits with code 1.
+template<unsigned SIZE> void CKMC2DbReaderSequential<SIZE>::reload_pref_buff()
+{
+    const uint64 n_entries = MIN(this->prefix_left_to_read, this->prefix_buff_size);
+    this->prefix_buff_pos = 0;
+
+    const bool read_ok = fread(prefix_buff, sizeof(uint64), n_entries, prefix_file) == n_entries;
+    if (!read_ok)
+    {
+        std::cout << "Error: some error while reading " << prefix_file_name << "\n";
+        exit(1);
+    }
+
+    this->prefix_left_to_read -= n_entries;
+
+    const bool buffer_not_full = n_entries < this->prefix_buff_size;
+    if (buffer_not_full)
+        this->prefix_buff[n_entries] = this->header.total_kmers;//guard
+}
+
+
+/*****************************************************************************************************************************/
+/******************************************** CKMC2DbReaderCountersOnly IMPLEMENTATION ***************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Opens the suffix file, allocates the suffix buffer and loads its first
+// chunk; only counters are read in this mode, so no prefix file is opened here.
+template<unsigned SIZE> CKMC2DbReaderCountersOnly<SIZE>::CKMC2DbReaderCountersOnly(const CKMC_header& header, const CInputDesc& desc) :
+ CKCM2DbReaderSeqCounter_Base<SIZE>(header, desc)
+{
+ this->open_files();
+ allocate_buffers();
+ this->reload_suf_buff();
+ this->sufix_number = 0;
+ }
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Reads records in file order until one has a counter within
+// [desc.cutoff_min, desc.cutoff_max]; stores it in `counter` and returns true.
+// Returns false once all header.total_kmers records have been consumed.
+template<unsigned SIZE> bool CKMC2DbReaderCountersOnly<SIZE>::NextCounter(uint32& counter)
+{
+    while (this->sufix_number < this->header.total_kmers)
+    {
+        // The counter bytes follow the suffix bytes inside a record.
+        const uchar* counter_bytes = this->sufix_buff + this->sufix_buff_pos + this->sufix_bytes;
+
+        // Lowest index = least significant byte.
+        counter = 0;
+        for (uint32 i = this->header.counter_size; i-- > 0;)
+            counter = (counter << 8) + counter_bytes[i];
+
+        // Move on to the next record, refilling the buffer when it is used up.
+        ++this->sufix_number;
+        this->sufix_buff_pos += this->record_size;
+        if (this->sufix_buff_pos >= this->sufix_buff_size)
+            this->reload_suf_buff();
+
+        const bool within_cutoffs = counter >= this->desc.cutoff_min && counter <= this->desc.cutoff_max;
+        if (within_cutoffs)
+            return true;
+    }
+    return false;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Allocates the suffix buffer (sufix_buff_size bytes).
+template<unsigned SIZE> void CKMC2DbReaderCountersOnly<SIZE>::allocate_buffers()
+{
+ this->sufix_buff = new uchar[this->sufix_buff_size];
+}
+
+
+
+/*****************************************************************************************************************************/
+/************************************************* CKMC2DbReader IMPLEMENTATION **********************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+// Registers the database in the progress tracker (header.total_kmers items)
+// and creates the reader implementation that matches `open_mode`.
+// An unknown mode prints an error and exits with code 1.
+template<unsigned SIZE> CKMC2DbReader<SIZE>::CKMC2DbReader(const CKMC_header& header, const CInputDesc& desc, CPercentProgress& percent_progress, KMCDBOpenMode open_mode) :
+    percent_progress(percent_progress)
+{
+    progress_id = percent_progress.RegisterItem(header.total_kmers);
+
+    if (open_mode == KMCDBOpenMode::sorted)
+    {
+        db_reader_sorted = std::make_unique <CKMC2DbReaderSorted<SIZE>>(header, desc);
+    }
+    else if (open_mode == KMCDBOpenMode::sequential)
+    {
+        db_reader_sequential = std::make_unique<CKMC2DbReaderSequential<SIZE>>(header, desc);
+    }
+    else if (open_mode == KMCDBOpenMode::counters_only)
+    {
+        db_reader_counters_only = std::make_unique<CKMC2DbReaderCountersOnly<SIZE>>(header, desc);
+    }
+    else //should never be here
+    {
+        std::cout << "Error: unknow open mode \n";
+        exit(1);
+    }
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Fetches the next bundle from the sorted reader and reports progress:
+// the progress item is advanced by bundle.Size() and marked complete once
+// this->finished has been set by the reader.
+// Uses db_reader_sorted, which is only created for KMCDBOpenMode::sorted.
+template<unsigned SIZE> void CKMC2DbReader<SIZE>::NextBundle(CBundle<SIZE>& bundle)
+{
+#ifdef ENABLE_LOGGER
+ CTimer timer;
+ timer.start();
+#endif
+ db_reader_sorted->NextBundle(bundle, this->finished);
+ percent_progress.UpdateItem(progress_id, bundle.Size());
+ if(this->finished)
+ {
+ percent_progress.Complete(progress_id);
+ }
+#ifdef ENABLE_LOGGER
+ // The log label below is Polish for "fetching a bundle from the input".
+ CLoger::GetLogger().log_operation("pobranie bundla z wejscia", this, timer.get_time());
+#endif
+}
+
+/*****************************************************************************************************************************/
+// Forwards to the sorted reader's IgnoreRest().
+// Uses db_reader_sorted, which is only created for KMCDBOpenMode::sorted.
+template<unsigned SIZE> void CKMC2DbReader<SIZE>::IgnoreRest()
+{
+ db_reader_sorted->IgnoreRest();
+}
+
+/*****************************************************************************************************************************/
+// Fetches the next k-mer from the sequential reader. On success the progress
+// item is advanced and true is returned; when the reader has no more k-mers
+// the progress item is marked complete and false is returned.
+template<unsigned SIZE> bool CKMC2DbReader<SIZE>::NextKmerSequential(CKmer<SIZE>& kmer, uint32& counter)
+{
+    const bool has_next = db_reader_sequential->NextKmerSequential(kmer, counter);
+    if (!has_next)
+    {
+        percent_progress.Complete(progress_id);
+        return false;
+    }
+    percent_progress.UpdateItem(progress_id);
+    return true;
+}
+
+/*****************************************************************************************************************************/
+// Fetches the next counter from the counters-only reader. On success the
+// progress item is advanced and true is returned; when the reader has no more
+// counters the progress item is marked complete and false is returned.
+template<unsigned SIZE> bool CKMC2DbReader<SIZE>::NextCounter(uint32& counter)
+{
+    const bool has_next = db_reader_counters_only->NextCounter(counter);
+    if (!has_next)
+    {
+        percent_progress.Complete(progress_id);
+        return false;
+    }
+    percent_progress.UpdateItem(progress_id);
+    return true;
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/kmc_tools/kmc_header.cpp b/kmc_tools/kmc_header.cpp
new file mode 100644
index 0000000..054ec9c
--- /dev/null
+++ b/kmc_tools/kmc_header.cpp
@@ -0,0 +1,88 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include "kmc_header.h"
+#include <cstring>
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+
+// Reads the header of the database "<file_name>.kmc_pre".
+//
+// Verifies the "KMCP" marker at both ends of the file, reads the header
+// offset and database version stored before the end marker, then the header
+// fields located header_offset bytes before them. For KMC2 databases the
+// number of bins is derived from the file size.
+// Any failure while opening the file or checking the markers prints an error
+// and exits with code 1.
+CKMC_header::CKMC_header(std::string file_name)
+{
+    file_name += ".kmc_pre";
+    FILE* file = my_fopen(file_name.c_str(), "rb");
+    if (!file)
+    {
+        std::cout << "Error: Cannot open file " << file_name << "\n";
+        exit(1);
+    }
+    char marker[4];
+    if (fread(marker, 1, 4, file) != 4)
+    {
+        std::cout << "Error while reading start marker in " << file_name << "\n";
+        exit(1);
+    }
+
+    if (strncmp(marker, "KMCP", 4) != 0)
+    {
+        std::cout << "Error: wrong start marker in " << file_name << "\n";
+        exit(1);
+    }
+
+    my_fseek(file, -4, SEEK_END);
+    if (fread(marker, 1, 4, file) != 4)
+    {
+        std::cout << "Error while reading end marker in " << file_name << "\n";
+        exit(1);
+    }
+
+    if (strncmp(marker, "KMCP", 4) != 0)
+    {
+        std::cout << "Error: wrong end marker in " << file_name << "\n";
+        exit(1);
+    }
+
+    my_fseek(file, 0, SEEK_END);
+    file_size = my_ftell(file);
+
+    // File tail: ... | db_version (4 B) | header_offset (4 B) | "KMCP" (4 B)
+    my_fseek(file, -8, SEEK_END);
+    load_uint(file, header_offset);
+
+    my_fseek(file, -12, SEEK_END);
+    load_uint(file, db_version);
+
+    my_fseek(file, 0LL - (header_offset + 8), SEEK_END);
+    load_uint(file, kmer_len);
+    load_uint(file, mode);
+    load_uint(file, counter_size);
+    load_uint(file, lut_prefix_len);
+    if (IsKMC2())
+        load_uint(file, signature_len);
+    load_uint(file, min_count);
+    load_uint(file, max_count);
+    load_uint(file, total_kmers);
+    uchar both_s_tmp;
+    load_uint(file, both_s_tmp);
+    // The stored flag is inverted with respect to both_strands.
+    both_strands = both_s_tmp == 1;
+    both_strands = !both_strands;
+
+    fclose(file);
+
+    if (IsKMC2())
+    {
+        // Sizes are kept in 64 bits: truncated to 32 bits, single_lut_size
+        // becomes 0 for lut_prefix_len >= 15 and the division below would
+        // divide by zero.
+        uint64 single_lut_size = (1ull << (2 * lut_prefix_len)) * sizeof(uint64);
+        uint64 map_size = ((1ull << (2 * signature_len)) + 1) * sizeof(uint32);
+        no_of_bins = (uint32)((file_size - sizeof(uint64) - 12 - header_offset - map_size) / single_lut_size);
+    }
+}
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/kmc_header.h b/kmc_tools/kmc_header.h
new file mode 100644
index 0000000..58e8edb
--- /dev/null
+++ b/kmc_tools/kmc_header.h
@@ -0,0 +1,55 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _KMC_HEADER_H
+#define _KMC_HEADER_H
+#include "defs.h"
+#include <string>
+#include <iostream>
+
+//************************************************************************************************************
+// CKMC_header - represents header of KMC database.
+//************************************************************************************************************
+struct CKMC_header
+{
+public:
+    uint32 kmer_len = 0;
+    uint32 mode = 0;
+    uint32 counter_size = 0;
+    uint32 lut_prefix_len = 0;
+    uint32 signature_len = 0; //only for kmc2
+    uint32 min_count = 0;
+    uint32 max_count = 0;
+    uint64 total_kmers = 0;
+    bool both_strands = true;
+    uint32 db_version = 0;
+    uint32 header_offset = 0;
+    uint64 file_size = 0;
+
+    uint32 no_of_bins = 0; //only for kmc2
+
+    // Returns true when the database version field marks a KMC2 database (0x200).
+    // const, so that it can also be called through a const reference to the header.
+    bool IsKMC2() const
+    {
+        return db_version == 0x200;
+    }
+
+    // Reads the header from "<file_name>.kmc_pre".
+    CKMC_header(std::string file_name);
+
+private:
+    // Reads sizeof(T) bytes from `file` into `res`, least significant byte first.
+    // NOTE(review): the result of getc() is not checked for EOF, so a truncated
+    // file yields garbage values instead of an error.
+    template<typename T> void load_uint(FILE* file, T& res)
+    {
+        res = 0;
+        for (uint32 i = 0; i < sizeof(T); ++i)
+            res += (T)getc(file) << (i << 3);
+    }
+};
+
+#endif
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/kmc_tools.cpp b/kmc_tools/kmc_tools.cpp
new file mode 100644
index 0000000..30601d5
--- /dev/null
+++ b/kmc_tools/kmc_tools.cpp
@@ -0,0 +1,360 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include <iostream>
+#include <vector>
+
+#include "config.h"
+#include "parser.h"
+#include "timer.h"
+#include "kmc1_db_reader.h"
+#include "kmc2_db_reader.h"
+#include "kmc1_db_writer.h"
+#include "parameters_parser.h"
+#include "histogram_writer.h"
+#include "dump_writer.h"
+#include "fastq_reader.h"
+#include "fastq_filter.h"
+#include "fastq_writer.h"
+#ifdef ENABLE_LOGGER
+#include "develop.h"
+#endif
+using namespace std;
+
+// CTools - runs the operation selected in CConfig (filter, histogram, dump,
+// compare or a k-mer set expression) for k-mers represented with SIZE words.
+template<unsigned SIZE> class CTools
+{
+    CParametersParser& parameters_parser;
+    CConfig& config;
+
+    // Opens the first input database in counters_only mode and runs a
+    // CHistogramWriter over it. Returns the writer's result.
+    bool histo()
+    {
+        if (!config.headers.front().IsKMC2()) //KMC1
+        {
+            CKMC1DbReader<SIZE> kmcdb(config.headers.front(), config.input_desc.front(), CConfig::GetInstance().percent_progress, KMCDBOpenMode::counters_only);
+            CHistogramWriter<CKMC1DbReader<SIZE>> writer(kmcdb);
+            return writer.Process();
+        }
+        else //KMC2
+        {
+            CKMC2DbReader<SIZE> kmcdb(config.headers.front(), config.input_desc.front(), CConfig::GetInstance().percent_progress, KMCDBOpenMode::counters_only);
+            CHistogramWriter<CKMC2DbReader<SIZE>> writer(kmcdb);
+            return writer.Process();
+        }
+    }
+
+    // Runs a CDumpWriter over the first input database. For KMC2 databases
+    // the wrapper variant is chosen from config.dump_params.sorted_output.
+    // Returns the writer's result.
+    bool dump()
+    {
+        if (!config.headers.front().IsKMC2()) //KMC1 - input is sorted
+        {
+            CKMCDBForDump<CKMC1DbReader<SIZE>, SIZE, true> kmcdb_wrapper;
+            CDumpWriter<decltype(kmcdb_wrapper), SIZE> writer(kmcdb_wrapper);
+            return writer.Process();
+        }
+        else //KMC2
+        {
+            if (config.dump_params.sorted_output)
+            {
+                CKMCDBForDump<CKMC2DbReader<SIZE>, SIZE, true> kmcdb_wrapper;
+                CDumpWriter<decltype(kmcdb_wrapper), SIZE> writer(kmcdb_wrapper);
+                return writer.Process();
+            }
+            else
+            {
+                CKMCDBForDump<CKMC2DbReader<SIZE>, SIZE, false> kmcdb_wrapper;
+                CDumpWriter<decltype(kmcdb_wrapper), SIZE> writer(kmcdb_wrapper);
+                return writer.Process();
+            }
+        }
+        return true;
+    }
+
+    // Filters the input read files against the first input database:
+    // sets up thread counts, memory pools and queues, starts reader, filter
+    // and writer threads, waits for them and releases the queues and pools.
+    // Always returns true; failures to open inputs terminate the process.
+    bool filter()
+    {
+        CFilteringParams& filtering_params = config.filtering_params;
+        CFilteringQueues filtering_queues;
+
+        //set parameters and queues
+        int32 avaiable_threads = config.avaiable_threads;
+
+        // True when `name` ends with `suffix`; safe for names shorter than the suffix.
+        auto has_suffix = [](const string& name, const string& suffix)
+        {
+            return name.size() >= suffix.size() &&
+                name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0;
+        };
+
+        bool gz_bz2 = false;
+        vector<uint64> file_sizes;
+
+        for (auto& p : filtering_params.input_srcs)
+        {
+            // Each extension is compared with its own length: a fixed
+            // 3-character tail can never match the 4-character ".bz2".
+            if (has_suffix(p, ".gz") || has_suffix(p, ".bz2"))
+            {
+                gz_bz2 = true;
+            }
+            FILE* tmp = my_fopen(p.c_str(), "rb");
+            if (!tmp)
+            {
+                cout << "Cannot open file: " << p.c_str();
+                exit(1);
+            }
+            my_fseek(tmp, 0, SEEK_END);
+            file_sizes.push_back(my_ftell(tmp));
+            fclose(tmp);
+        }
+        if (gz_bz2)
+        {
+            // Count only the files larger than 5% of the biggest one when
+            // deciding how many readers to start.
+            sort(file_sizes.begin(), file_sizes.end(), greater<uint64>());
+            uint64 file_size_threshold = (uint64)(file_sizes.front() * 0.05);
+            int32 n_allowed_files = 0;
+            for (auto& p : file_sizes)
+                if (p > file_size_threshold)
+                    ++n_allowed_files;
+            filtering_params.n_readers = MIN(n_allowed_files, MAX(1, avaiable_threads / 2));
+        }
+        else
+            filtering_params.n_readers = 1;
+
+        // The threads that are not used by readers are given to filters.
+        avaiable_threads -= filtering_params.n_readers;
+        filtering_params.n_filters = max(1, avaiable_threads);
+
+        filtering_params.fastq_buffer_size = 1 << 25;
+
+        filtering_params.mem_part_pmm_fastq_reader = filtering_params.fastq_buffer_size + CFastqReader::OVERHEAD_SIZE;
+        filtering_params.mem_tot_pmm_fastq_reader = filtering_params.mem_part_pmm_fastq_reader * (filtering_params.n_readers + 48);
+
+        filtering_params.mem_part_pmm_fastq_filter = filtering_params.mem_part_pmm_fastq_reader;
+        filtering_params.mem_tot_pmm_fastq_filter = filtering_params.mem_part_pmm_fastq_filter * (filtering_params.n_filters + 48);
+
+        filtering_queues.input_files_queue = new CInputFilesQueue(filtering_params.input_srcs);
+        filtering_queues.input_part_queue = new CPartQueue(filtering_params.n_readers);
+        filtering_queues.filtered_part_queue = new CPartQueue(filtering_params.n_filters);
+
+        filtering_queues.pmm_fastq_reader = new CMemoryPool(filtering_params.mem_tot_pmm_fastq_reader, filtering_params.mem_part_pmm_fastq_reader);
+        filtering_queues.pmm_fastq_filter = new CMemoryPool(filtering_params.mem_tot_pmm_fastq_filter, filtering_params.mem_part_pmm_fastq_filter);
+
+        filtering_params.kmer_len = config.headers.front().kmer_len;
+
+        vector<thread> readers_ths;
+        vector<thread> filters_ths;
+        vector<unique_ptr<CWFastqFilter>> filters;
+        vector<unique_ptr<CWFastqReader>> readers;
+
+        CKMCFile kmc_api;
+        if (!kmc_api.OpenForRA(config.input_desc.front().file_src))
+        {
+            cout << "Error: cannot open: " << config.input_desc.front().file_src << " by KMC API\n";
+            exit(1);
+        }
+        kmc_api.SetMinCount(config.input_desc.front().cutoff_min);
+        kmc_api.SetMaxCount(config.input_desc.front().cutoff_max);
+
+        CWFastqWriter writer(filtering_params, filtering_queues);
+        thread writer_th(writer);
+
+        // The worker objects are owned by the vectors; the threads only hold references.
+        for (uint32 i = 0; i < filtering_params.n_filters; ++i)
+        {
+            filters.push_back(make_unique<CWFastqFilter>(filtering_params, filtering_queues, kmc_api));
+            filters_ths.emplace_back(ref(*filters.back()));
+        }
+
+        for (uint32 i = 0; i < filtering_params.n_readers; ++i)
+        {
+            readers.push_back(make_unique<CWFastqReader>(filtering_params, filtering_queues));
+            readers_ths.emplace_back(ref(*readers.back()));
+        }
+
+        writer_th.join();
+        for (auto& th : filters_ths)
+            th.join();
+
+        filters.clear();
+
+        for (auto& th : readers_ths)
+            th.join();
+
+        readers.clear();
+
+        delete filtering_queues.input_part_queue;
+        delete filtering_queues.pmm_fastq_reader;
+        delete filtering_queues.pmm_fastq_filter;
+        delete filtering_queues.input_files_queue;
+        delete filtering_queues.filtered_part_queue;
+
+        return true;
+    }
+
+public:
+    CTools(CParametersParser& parameters_parser) :
+        parameters_parser(parameters_parser),
+        config(CConfig::GetInstance())
+    {
+    }
+
+    // Dispatches on config.mode. FILTER, HISTOGRAM and DUMP return the result
+    // of the corresponding helper. COMPARE prints the verdict and terminates
+    // the process (exit code 0 when the databases are equal, 1 otherwise).
+    // Any other mode evaluates the parsed expression and writes the result
+    // with CKMC1DbWriter, returning true.
+    bool Process()
+    {
+        if (config.mode == CConfig::Mode::FILTER)
+        {
+            return filter();
+        }
+        if (config.mode == CConfig::Mode::HISTOGRAM)
+        {
+            return histo();
+        }
+        else if (config.mode == CConfig::Mode::DUMP)
+        {
+            return dump();
+        }
+        else if (config.mode == CConfig::Mode::COMPARE)
+        {
+            CInput<SIZE> *db1, *db2;
+            if (!config.headers[0].IsKMC2())
+                db1 = new CKMC1DbReader<SIZE>(config.headers[0], config.input_desc[0], CConfig::GetInstance().percent_progress, KMCDBOpenMode::sorted);
+            else
+                db1 = new CKMC2DbReader<SIZE>(config.headers[0], config.input_desc[0], CConfig::GetInstance().percent_progress, KMCDBOpenMode::sorted);
+
+            if (!config.headers[1].IsKMC2())
+                db2 = new CKMC1DbReader<SIZE>(config.headers[1], config.input_desc[1], CConfig::GetInstance().percent_progress, KMCDBOpenMode::sorted);
+            else
+                db2 = new CKMC2DbReader<SIZE>(config.headers[1], config.input_desc[1], CConfig::GetInstance().percent_progress, KMCDBOpenMode::sorted);
+
+            CBundle<SIZE> input1(db1), input2(db2);
+            CComparer<SIZE> comparer(&input1, &input2);
+
+            bool res = comparer.Equals();
+
+            delete db1;
+            delete db2;
+            std::cout << "\n";
+            if (res)
+            {
+                cout << "DB Equals\n";
+                exit(0);
+            }
+            else
+            {
+                cout << "DB Differs\n";
+                exit(1);
+            }
+        }
+        else
+        {
+            CExpressionNode<SIZE>* expression_root = parameters_parser.GetExpressionRoot<SIZE>();
+            auto t = expression_root->GetExecutionRoot();
+            delete expression_root;
+            CKMC1DbWriter<SIZE> writer(t);
+            writer.Process();
+            delete t;
+            return true;
+        }
+        return false;
+    }
+
+
+
+};
+
+
+// CApplication<SIZE> - handles k-mer lengths in ((SIZE - 1) * 32, SIZE * 32]
+// with CTools<SIZE> and hands every other length over to CApplication<SIZE - 1>.
+template<unsigned SIZE> class CApplication
+{
+    CApplication<SIZE - 1>* app_1;
+    CTools<SIZE>* tools;
+    bool is_selected;
+    CConfig& config;
+    CParametersParser& parameter_parser;
+public:
+    // Always builds the chain for smaller sizes; CTools<SIZE> is created only
+    // when the configured k-mer length belongs to this SIZE.
+    CApplication(CParametersParser& parameter_parser) :
+        config(CConfig::GetInstance()), parameter_parser(parameter_parser)
+    {
+        is_selected = config.kmer_len > ((int32)SIZE - 1) * 32 && config.kmer_len <= (int32)SIZE * 32;
+
+        app_1 = new CApplication<SIZE - 1>(parameter_parser);
+        tools = is_selected ? new CTools<SIZE>(parameter_parser) : nullptr;
+    }
+
+    ~CApplication()
+    {
+        delete app_1;
+        // tools is nullptr whenever this size is not selected, so deleting
+        // it unconditionally is a no-op in that case.
+        delete tools;
+    }
+
+    // Runs the tools of this size when selected, otherwise delegates to the
+    // application for smaller sizes.
+    bool Process()
+    {
+        return is_selected ? tools->Process() : app_1->Process();
+    }
+};
+
+// CApplication<1> - end of the CApplication chain; handles k-mer lengths of
+// at most 32 with CTools<1>.
+template<> class CApplication<1>
+{
+    CTools<1>* tools;
+    CConfig& config;
+    CParametersParser& parameter_parser;
+    bool is_selected;
+public:
+    // Creates CTools<1> only when the configured k-mer length is at most 32.
+    CApplication(CParametersParser& parameter_parser) :
+        config(CConfig::GetInstance()), parameter_parser(parameter_parser)
+    {
+        is_selected = config.kmer_len <= 32;
+
+        if (is_selected)
+            tools = new CTools<1>(parameter_parser);
+        else
+            tools = nullptr;
+    }
+    // Declared with the plain class name: the `~CApplication<1>()` spelling
+    // (a template-id as destructor name) is not valid in C++20.
+    ~CApplication()
+    {
+        delete tools; // deleting nullptr is a no-op
+    }
+    // Runs the tools when selected; returns false when no size was selected.
+    bool Process() {
+        if (is_selected)
+        {
+            return tools->Process();
+        }
+        return false;
+    }
+};
+
+
+// Entry point: parses the command line, validates the input databases and,
+// when they are valid, sets the thread count and runs the selected operation.
+// With ENABLE_LOGGER defined the run time and logger statistics are printed.
+// NOTE(review): the result of app.Process() is ignored and no explicit status
+// is returned, so the exit status does not reflect failures - confirm intended.
+int main(int argc, char**argv)
+{
+#ifdef ENABLE_LOGGER
+ CTimer timer;
+ timer.start();
+#endif
+ CParametersParser params_parser(argc, argv);
+ params_parser.Parse();
+ if (params_parser.validate_input_dbs())
+ {
+ params_parser.SetThreads();
+ // KMER_WORDS is the largest SIZE instantiated for CApplication.
+ CApplication<KMER_WORDS> app(params_parser);
+ app.Process();
+ }
+
+#ifdef ENABLE_LOGGER
+
+ cout << "RUN TIME: " << timer.get_time() <<"ms\n\n";
+
+ CLoger::GetLogger().print_stats();
+
+#endif
+}
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmer_counter/kmer_counter.vcxproj b/kmc_tools/kmc_tools.vcxproj
similarity index 65%
copy from kmer_counter/kmer_counter.vcxproj
copy to kmc_tools/kmc_tools.vcxproj
index 207867f..92e7262 100644
--- a/kmer_counter/kmer_counter.vcxproj
+++ b/kmc_tools/kmc_tools.vcxproj
@@ -19,38 +19,36 @@
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
- <ProjectGuid>{8C8B90DA-28B7-4D82-81F3-C0E7CE52D59F}</ProjectGuid>
+ <ProjectGuid>{F3B0CC94-9DD0-4642-891C-EA08BDA50260}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
- <RootNamespace>kmer_counter</RootNamespace>
+ <RootNamespace>kmc_tools</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
- <CharacterSet>NotSet</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
- <CharacterSet>NotSet</CharacterSet>
- <UseOfMfc>Static</UseOfMfc>
<PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- <CharacterSet>NotSet</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- <CharacterSet>NotSet</CharacterSet>
- <UseOfMfc>Static</UseOfMfc>
<PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
@@ -85,16 +83,12 @@
<PrecompiledHeader>Use</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
- <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <MultiProcessorCompilation>true</MultiProcessorCompilation>
- <OpenMPSupport>true</OpenMPSupport>
- <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
- <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -102,19 +96,13 @@
<PrecompiledHeader>Use</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
- <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <MultiProcessorCompilation>true</MultiProcessorCompilation>
- <OpenMPSupport>true</OpenMPSupport>
- <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
- <AdditionalOptions>/D "_VARIADIC_MAX=10" /bigobj %(AdditionalOptions)</AdditionalOptions>
+ <PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
- <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
- <IgnoreAllDefaultLibraries>
- </IgnoreAllDefaultLibraries>
+ <AdditionalDependencies />
<IgnoreSpecificDefaultLibraries>libcmt.lib</IgnoreSpecificDefaultLibraries>
</Link>
</ItemDefinitionGroup>
@@ -125,15 +113,15 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
- <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
- <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ <Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -143,90 +131,81 @@
<Optimization>Full</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
- <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <MultiProcessorCompilation>true</MultiProcessorCompilation>
- <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
- <OpenMPSupport>true</OpenMPSupport>
- <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <SDLCheck>true</SDLCheck>
<InlineFunctionExpansion>Default</InlineFunctionExpansion>
- <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
- <AdditionalOptions>/D "_VARIADIC_MAX=10"</AdditionalOptions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
- <GenerateDebugInformation>false</GenerateDebugInformation>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
- <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ <Profile>true</Profile>
+ <AdditionalDependencies />
</Link>
</ItemDefinitionGroup>
<ItemGroup>
- <None Include="ReadMe.txt" />
+ <Text Include="ReadMe.txt" />
</ItemGroup>
<ItemGroup>
+ <ClInclude Include="..\kmc_api\kmc_file.h" />
+ <ClInclude Include="..\kmc_api\kmer_api.h" />
+ <ClInclude Include="..\kmc_api\mmer.h" />
<ClInclude Include="asmlib_wrapper.h" />
- <ClInclude Include="bkb_merger.h" />
- <ClInclude Include="bkb_reader.h" />
- <ClInclude Include="bkb_sorter.h" />
- <ClInclude Include="bkb_subbin.h" />
- <ClInclude Include="bkb_writer.h" />
+ <ClInclude Include="bundle.h" />
+ <ClInclude Include="config.h" />
<ClInclude Include="defs.h" />
- <ClInclude Include="develop.h" />
+ <ClInclude Include="dump_writer.h" />
+ <ClInclude Include="expression_node.h" />
+ <ClInclude Include="fastq_filter.h" />
<ClInclude Include="fastq_reader.h" />
- <ClInclude Include="bkb_uncompactor.h" />
- <ClInclude Include="kb_collector.h" />
- <ClInclude Include="kb_completer.h" />
- <ClInclude Include="kb_reader.h" />
- <ClInclude Include="kb_sorter.h" />
- <ClInclude Include="kb_storer.h" />
- <ClInclude Include="kmc.h" />
+ <ClInclude Include="fastq_writer.h" />
+ <ClInclude Include="histogram_writer.h" />
+ <ClInclude Include="kmc1_db_reader.h" />
+ <ClInclude Include="kmc1_db_writer.h" />
+ <ClInclude Include="kmc2_db_reader.h" />
+ <ClInclude Include="kmc_header.h" />
<ClInclude Include="kmer.h" />
- <ClInclude Include="kxmer_set.h" />
<ClInclude Include="libs\asmlib.h" />
<ClInclude Include="libs\bzlib.h" />
<ClInclude Include="libs\bzlib_private.h" />
<ClInclude Include="libs\zconf.h" />
<ClInclude Include="libs\zlib.h" />
- <ClInclude Include="mem_disk_file.h" />
<ClInclude Include="meta_oper.h" />
- <ClInclude Include="mmer.h" />
- <ClInclude Include="rev_byte.h" />
- <ClInclude Include="s_mapper.h" />
- <ClInclude Include="params.h" />
+ <ClInclude Include="nc_utils.h" />
+ <ClInclude Include="operations.h" />
+ <ClInclude Include="output_parser.h" />
+ <ClInclude Include="parameters_parser.h" />
+ <ClInclude Include="parser.h" />
+ <ClInclude Include="percent_progress.h" />
<ClInclude Include="queues.h" />
- <ClInclude Include="radix.h" />
- <ClInclude Include="splitter.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="timer.h" />
+ <ClInclude Include="tokenizer.h" />
</ItemGroup>
<ItemGroup>
- <ClCompile Include="bkb_reader.cpp" />
- <ClCompile Include="bkb_writer.cpp" />
- <ClCompile Include="develop.cpp" />
+ <ClCompile Include="..\kmc_api\kmc_file.cpp" />
+ <ClCompile Include="..\kmc_api\kmer_api.cpp" />
+ <ClCompile Include="..\kmc_api\mmer.cpp" />
+ <ClCompile Include="fastq_filter.cpp" />
<ClCompile Include="fastq_reader.cpp" />
- <ClCompile Include="kb_completer.cpp" />
- <ClCompile Include="kb_storer.cpp" />
- <ClCompile Include="kmer.cpp" />
- <ClCompile Include="kmer_counter.cpp" />
- <ClCompile Include="mem_disk_file.cpp" />
- <ClCompile Include="mmer.cpp" />
- <ClCompile Include="radix.cpp" />
- <ClCompile Include="rev_byte.cpp" />
+ <ClCompile Include="fastq_writer.cpp" />
+ <ClCompile Include="kmc_header.cpp" />
+ <ClCompile Include="kmc_tools.cpp" />
+ <ClCompile Include="nc_utils.cpp" />
+ <ClCompile Include="parameters_parser.cpp" />
+ <ClCompile Include="parser.cpp" />
+ <ClCompile Include="percent_progress.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
</ClCompile>
- <ClCompile Include="timer.cpp" />
- </ItemGroup>
- <ItemGroup>
- <Reference Include="System" />
- <Reference Include="System.Data" />
- <Reference Include="System.Drawing" />
- <Reference Include="System.Windows.Forms" />
- <Reference Include="System.Xml" />
+ <ClCompile Include="tokenizer.cpp" />
</ItemGroup>
<ItemGroup>
<Library Include="libs\alibcof64.lib" />
diff --git a/kmc_tools/kmer.h b/kmc_tools/kmer.h
new file mode 100644
index 0000000..48090c9
--- /dev/null
+++ b/kmc_tools/kmer.h
@@ -0,0 +1,522 @@
+/*
+This file is a part of KMC software distributed under GNU GPL 3 licence.
+The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+Version: 2.3.0
+Date : 2015-08-21
+*/
+
+#ifndef _KMER_H
+#define _KMER_H
+
+// Important remark: there is no inheritance here to guarantee that all classes defined here are POD according to C++11
+
+#include "defs.h"
+#include "meta_oper.h"
+#include <string>
+
+// *************************************************************************
+// Ckmer class for k > 32 with classic kmer counting
+template<unsigned SIZE> struct CKmer {
+ unsigned long long data[SIZE]; // packed k-mer bits; data[0] is the least significant word (operator< compares from data[SIZE - 1] down)
+
+ typedef unsigned long long data_t;
+ inline void set(const CKmer<SIZE> &x); // copy all words from x
+
+ inline void mask(const CKmer<SIZE> &x); // bitwise AND with x, word by word
+ inline uint32 end_mask(const uint32 mask); // data[0] & mask
+ inline void set_2bits(const uint64 x, const uint32 p); // add x at bit position p
+ inline uchar get_2bits(const uint32 p); // read the 2 bits starting at bit position p
+ inline uchar get_byte(const uint32 p); // read byte number p
+ inline void set_byte(const uint32 p, uchar x); // add x into byte number p
+ inline void set_bits(const uint32 p, const uint32 n, uint64 x); // add the n-bit value x at bit position p
+
+ inline void SHL_insert_2bits(const uint64 x); // shift left by 2 bits, add x at the bottom
+ inline void SHR_insert_2bits(const uint64 x, const uint32 p); // shift right by 2 bits, add x at bit position p
+
+ inline void SHR(const uint32 p); // shift right by 2 * p bits
+ inline void SHL(const uint32 p); // shift left by 2 * p bits
+
+ inline uint64 remove_suffix(const uint32 n) const; // low 64 bits of the k-mer shifted right by n bits
+ inline void set_n_1(const uint32 n); // mask with the n lowest bits set
+ inline void set_n_01(const uint32 n); // even-indexed bits below n set
+
+ inline void store(uchar *&buffer, int32 n); // write n bytes, most significant first; advances buffer
+ inline void store(uchar *buffer, int32 p, int32 n); // same bytes, written starting at buffer[p]
+ inline void load(uchar *&buffer, int32 n); // clear, then read n bytes, most significant first; advances buffer
+
+ inline bool operator==(const CKmer<SIZE> &x); // true when all words are equal
+ inline bool operator<(const CKmer<SIZE> &x); // compares words from the most significant one down
+
+ inline void clear(void); // zero all words
+
+ inline char get_symbol(int p); // symbol p (2 bits each) as 'A', 'C', 'G' or 'T'
+};
+
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::set(const CKmer<SIZE> &x)
+{ // Copies every word of x into this k-mer.
+#ifdef USE_META_PROG
+ IterFwd([&](const int &i){ // IterFwd is not defined in this header; presumably it visits i = 0..SIZE-1 - confirm
+ data[i] = x.data[i];
+ }, uint_<SIZE - 1>());
+#else
+ for (uint32 i = 0; i < SIZE; ++i)
+ data[i] = x.data[i];
+#endif
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::mask(const CKmer<SIZE> &x)
+{ // ANDs every word of this k-mer with the matching word of x.
+#ifdef USE_META_PROG
+ IterFwd([&](const int &i){ // IterFwd is not defined in this header; presumably it visits i = 0..SIZE-1 - confirm
+ data[i] &= x.data[i];
+ }, uint_<SIZE - 1>());
+#else
+ for (uint32 i = 0; i < SIZE; ++i)
+ data[i] &= x.data[i];
+#endif
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline uint32 CKmer<SIZE>::end_mask(const uint32 mask)
+{ // mask is 32 bits wide, so only the low 32 bits of data[0] can appear in the result.
+ return data[0] & mask;
+}
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::set_2bits(const uint64 x, const uint32 p)
+{
+ // Same as data[p >> 6] |= x << (p & 63) only while the target bits are zero; otherwise '+' carries into higher bits.
+ data[p >> 6] += x << (p & 63);
+}
+
+template<unsigned SIZE> inline uchar CKmer<SIZE>::get_2bits(const uint32 p)
+{ // Reads the 2 bits that start at bit position p (word p >> 6, offset p & 63).
+ return (data[p >> 6] >> (p & 63)) & 3;
+}
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::SHR_insert_2bits(const uint64 x, const uint32 p)
+{ // Shifts the whole k-mer right by 2 bits, then adds x at bit position p.
+#ifdef USE_META_PROG
+ IterFwd([&](const int &i){
+ data[i] >>= 2;
+ // The 2 bits vacated at the top of data[i] are zero, so '+' behaves like '|' here.
+ data[i] += data[i + 1] << (64 - 2);
+ }, uint_<SIZE - 2>());
+#else
+ for (uint32 i = 0; i < SIZE - 1; ++i)
+ {
+ data[i] >>= 2;
+ // The 2 bits vacated at the top of data[i] are zero, so '+' behaves like '|' here.
+ data[i] += data[i + 1] << (64 - 2);
+ }
+#endif
+ data[SIZE - 1] >>= 2;
+
+ // Equals data[p >> 6] |= x << (p & 63) only when the target bits are zero; otherwise '+' carries.
+ data[p >> 6] += x << (p & 63);
+}
+
+
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::SHR(const uint32 p)
+{
+ // Shifts the k-mer right by p symbols (2 * p bits); every word is shifted by 2 * p, so 2 * p must stay below 64.
+ if (p == 0) return; // shifting by nothing is a no-op; also avoids the undefined 64-bit shift data[i + 1] << 64 below
+#ifdef USE_META_PROG
+ IterFwd([&](const int &i){
+ data[i] >>= 2 * p;
+ data[i] += data[i + 1] << (64 - 2 * p); // fills the vacated (zero) top bits, so '+' acts as '|'
+ }, uint_<SIZE - 2>());
+#else
+ for (uint32 i = 0; i < SIZE - 1; ++i)
+ {
+ data[i] >>= 2 * p;
+ data[i] += data[i + 1] << (64 - 2 * p); // fills the vacated (zero) top bits, so '+' acts as '|'
+ }
+#endif
+ data[SIZE - 1] >>= 2 * p;
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::SHL(const uint32 p)
+{
+ // Shifts the k-mer left by p symbols (2 * p bits); every word is shifted by 2 * p, so 2 * p must stay below 64.
+ if (p == 0) return; // shifting by nothing is a no-op; also avoids the undefined 64-bit shift data[i - 1] >> 64 below
+#ifdef USE_META_PROG
+ IterRev([&](const int &i){
+ data[i + 1] <<= p * 2;
+ data[i + 1] += data[i] >> (64 - p * 2); // fills the vacated (zero) low bits, so '+' acts as '|'
+ }, uint_<SIZE - 2>());
+#else
+ for (uint32 i = SIZE - 1; i > 0; --i)
+ {
+ data[i] <<= p * 2;
+ data[i] += data[i - 1] >> (64 - p * 2); // fills the vacated (zero) low bits, so '+' acts as '|'
+ }
+#endif
+ data[0] <<= p * 2;
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::SHL_insert_2bits(const uint64 x)
+{ // Shifts the whole k-mer left by 2 bits and adds x into the vacated low bits of data[0].
+#ifdef USE_META_PROG
+ IterRev([&](const int &i){
+ data[i + 1] <<= 2;
+ // The 2 low bits of data[i + 1] are zero after the shift, so '+' behaves like '|' here.
+ data[i + 1] += data[i] >> (64 - 2);
+ }, uint_<SIZE - 2>());
+#else
+ for (uint32 i = SIZE - 1; i > 0; --i)
+ {
+ data[i] <<= 2;
+ // The 2 low bits of data[i] are zero after the shift, so '+' behaves like '|' here.
+ data[i] += data[i - 1] >> (64 - 2);
+ }
+#endif
+ data[0] <<= 2;
+ // Equals data[0] |= x as long as x fits in 2 bits.
+ data[0] += x;
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline uchar CKmer<SIZE>::get_byte(const uint32 p)
+{ // Returns byte p of the k-mer; byte 0 is the lowest byte of data[0].
+ return (data[p >> 3] >> ((p << 3) & 63)) & 0xFF; // word p >> 3, bit offset 8 * (p & 7)
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::set_byte(const uint32 p, uchar x)
+{
+ // Adds x into byte p (word p >> 3, bit offset 8 * (p & 7)); same as '|=' only if that byte is currently zero (load() calls clear() first).
+ data[p >> 3] += ((uint64)x) << ((p & 7) << 3);
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::set_bits(const uint32 p, const uint32 n, uint64 x)
+{
+ // Adds the n-bit value x starting at bit p. Low part first; same as '|=' only while the target bits are zero.
+ data[p >> 6] += x << (p & 63);
+ if ((p >> 6) != ((p + n - 1) >> 6)) // first and last bit of the field fall into different words
+ // High part spills into the next word. NOTE(review): n == 0 with p % 64 == 0 would reach a shift by 64 here - confirm callers never pass that.
+ data[(p >> 6) + 1] += x >> (64 - (p & 63));
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline bool CKmer<SIZE>::operator==(const CKmer<SIZE> &x) {
+ // Word-by-word comparison; stops at the first differing word.
+ uint32 w = 0;
+ while (w < SIZE && data[w] == x.data[w])
+ ++w;
+ return w == SIZE; // reached the end only if every word matched
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline bool CKmer<SIZE>::operator<(const CKmer<SIZE> &x) {
+ // Compares from the most significant word (SIZE - 1) down to word 0; the first difference decides.
+ uint32 w = SIZE;
+ while (w-- > 0)
+ if (data[w] != x.data[w])
+ return data[w] < x.data[w];
+ return false; // all words equal
+}
+
+
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::clear(void)
+{ // Sets every word of the k-mer to zero.
+#ifdef USE_META_PROG
+ IterFwd([&](const int &i){ // IterFwd is not defined in this header; presumably it visits i = 0..SIZE-1 - confirm
+ data[i] = 0;
+ }, uint_<SIZE - 1>());
+#else
+ for (uint32 i = 0; i < SIZE; ++i)
+ data[i] = 0;
+#endif
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline uint64 CKmer<SIZE>::remove_suffix(const uint32 n) const
+{ // Returns the low 64 bits of the k-mer after shifting it right by n bits.
+ uint32 p = n >> 6; // index of the 64-bit word that holds bit n
+ uint32 r = n & 63; // offset of bit n inside that word
+ // Nothing is borrowed from the next word when this is the last word or when r == 0;
+ // the r == 0 guard also avoids the undefined 64-bit shift data[p + 1] << 64.
+ if (p == SIZE - 1 || r == 0)
+ return data[p] >> r;
+ // The two shifted parts occupy disjoint bits, so '+' acts as bitwise OR here.
+ return (data[p + 1] << (64 - r)) + (data[p] >> r);
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::set_n_1(const uint32 n)
+{ // Builds a mask with the n lowest bits set and all others cleared.
+ clear();
+
+ for (uint32 i = 0; i < (n >> 6); ++i) // words that are covered completely
+ data[i] = ~((uint64)0);
+
+ uint32 r = n & 63; // bits left over for the partially covered word
+
+ if (r) // skipping r == 0 also keeps data[n >> 6] in range when n == 64 * SIZE
+ data[n >> 6] = (1ull << r) - 1;
+}
+
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::set_n_01(const uint32 n)
+{ // Sets every even-indexed bit below n (pattern ...0101) and clears everything else.
+ clear();
+
+ for (uint32 i = 0; i < n; ++i)
+ if (!(i & 1)) // even bit index
+ // Each bit is added exactly once after clear(), so '+' is equivalent to '|'.
+ data[i >> 6] += (1ull << (i & 63));
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::store(uchar *&buffer, int32 n)
+{ // Writes bytes n-1 down to 0 of the k-mer (most significant first); buffer is a reference and ends up past the written bytes.
+ for (int32 i = n - 1; i >= 0; --i)
+ *buffer++ = get_byte(i);
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::store(uchar *buffer, int32 p, int32 n)
+{ // Writes bytes n-1 down to 0 of the k-mer (most significant first) into buffer[p], buffer[p + 1], ...; the caller's pointer is not changed.
+ for (int32 i = n - 1; i >= 0; --i)
+ buffer[p++] = get_byte(i);
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline void CKmer<SIZE>::load(uchar *&buffer, int32 n)
+{ // Inverse of store(uchar *&, int32): reads n bytes, most significant first, and advances buffer past them.
+ clear(); // set_byte() adds into the word, so the k-mer must start from zero
+ for (int32 i = n - 1; i >= 0; --i)
+ set_byte(i, *buffer++);
+}
+
+// *********************************************************************
+template<unsigned SIZE> inline char CKmer<SIZE>::get_symbol(int p)
+{
+ // Each word holds 32 two-bit symbols: word index is p >> 5, bit offset is 2 * (p & 31).
+ const uint32 code = (data[p >> 5] >> (2 * (p & 31))) & 0x03;
+
+ // 2-bit code -> nucleotide letter: 0 = A, 1 = C, 2 = G, 3 = T.
+ if (code == 0)
+ return 'A';
+ if (code == 1)
+ return 'C';
+ return code == 2 ? 'G' : 'T';
+}
+
+// *********************************************************************
+// *********************************************************************
+// *********************************************************************
+// *********************************************************************
+// Ckmer class for k <= 32 with classic kmer counting
+template<> struct CKmer<1> {
+ unsigned long long data; // the whole k-mer packed into a single 64-bit word
+
+
+ typedef unsigned long long data_t;
+ static uint32 QUALITY_SIZE; // declared only; no definition appears in this header
+
+ void set(const CKmer<1> &x); // copy x
+
+ void mask(const CKmer<1> &x); // bitwise AND with x
+ uint32 end_mask(const uint32 mask); // data & mask
+ void set_2bits(const uint64 x, const uint32 p); // add x at bit position p
+ uchar get_2bits(const uint32 p); // read the 2 bits starting at bit position p
+ uchar get_byte(const uint32 p); // read byte number p
+ void set_byte(const uint32 p, uchar x); // add x into byte number p
+ void set_bits(const uint32 p, const uint32 n, uint64 x); // add x at bit position p (n is not used by this specialization)
+
+ void SHL_insert_2bits(const uint64 x); // shift left by 2 bits, add x at the bottom
+ void SHR_insert_2bits(const uint64 x, const uint32 p); // shift right by 2 bits, add x at bit position p
+
+ void SHR(const uint32 p); // shift right by 2 * p bits
+ void SHL(const uint32 p); // shift left by 2 * p bits
+
+ uint64 remove_suffix(const uint32 n) const; // data >> n
+ void set_n_1(const uint32 n); // mask with the n lowest bits set
+ void set_n_01(const uint32 n); // even-indexed bits below n set
+
+ void store(uchar *&buffer, int32 n); // write n bytes, most significant first; advances buffer
+ void store(uchar *buffer, int32 p, int32 n); // same bytes, written starting at buffer[p]
+ void load(uchar *&buffer, int32 n); // clear, then read n bytes, most significant first; advances buffer
+
+ bool operator==(const CKmer<1> &x);
+ bool operator<(const CKmer<1> &x);
+
+ void clear(void); // data = 0
+
+ inline char get_symbol(int p); // symbol p (2 bits each) as 'A', 'C', 'G' or 'T'
+};
+
+
+// *********************************************************************
+inline void CKmer<1>::mask(const CKmer<1> &x)
+{ // Keeps only the bits that are also set in x.
+ data &= x.data;
+}
+
+
+// *********************************************************************
+inline uint32 CKmer<1>::end_mask(const uint32 mask)
+{ // mask is 32 bits wide, so only the low 32 bits of data can appear in the result.
+ return data & mask;
+}
+// *********************************************************************
+inline void CKmer<1>::set(const CKmer<1> &x)
+{ // Copies the single data word of x.
+ data = x.data;
+}
+
+// *********************************************************************
+inline void CKmer<1>::set_2bits(const uint64 x, const uint32 p)
+{
+ // Same as data |= x << p only while the target bits are zero; otherwise '+' carries into higher bits.
+ data += x << p;
+}
+
+inline uchar CKmer<1>::get_2bits(const uint32 p)
+{ // Reads the 2 bits that start at bit position p.
+ return (data >> p) & 3;
+}
+// *********************************************************************
+inline void CKmer<1>::SHR_insert_2bits(const uint64 x, const uint32 p)
+{ // Drops the lowest 2 bits, then adds x at bit position p.
+ data >>= 2;
+ // Same as data |= x << p only while the target bits are zero; otherwise '+' carries into higher bits.
+ data += x << p;
+}
+
+// *********************************************************************
+inline void CKmer<1>::SHR(const uint32 p)
+{ // Shifts right by p symbols (2 bits each); 2 * p must stay below 64, the width of data.
+ data >>= 2 * p;
+}
+
+// *********************************************************************
+inline void CKmer<1>::SHL(const uint32 p)
+{ // Shifts left by p symbols (2 bits each); 2 * p must stay below 64, the width of data.
+ data <<= p * 2;
+}
+// *********************************************************************
+inline void CKmer<1>::SHL_insert_2bits(const uint64 x)
+{
+ // The 2 low bits are zero after the shift, so '+' equals '|' as long as x fits in 2 bits.
+ data = (data << 2) + x;
+}
+
+// *********************************************************************
+inline uchar CKmer<1>::get_byte(const uint32 p)
+{ // Returns byte p of data (byte 0 is the lowest); p must be below 8 to keep the shift under 64 bits.
+ return (data >> (p << 3)) & 0xFF;
+}
+
+// *********************************************************************
+inline void CKmer<1>::set_byte(const uint32 p, uchar x)
+{
+ // Adds x into byte p; same as '|=' only if that byte is currently zero (load() calls clear() first).
+ data += ((uint64)x) << (p << 3);
+}
+
+// *********************************************************************
+inline void CKmer<1>::set_bits(const uint32 p, const uint32 n, uint64 x)
+{
+ // n is unused here: with a single word the field cannot straddle words. Same as '|=' only while the target bits are zero.
+ data += x << p;
+}
+
+// *********************************************************************
+inline bool CKmer<1>::operator==(const CKmer<1> &x) { // equality of the single data word
+ return data == x.data;
+}
+
+// *********************************************************************
+inline bool CKmer<1>::operator<(const CKmer<1> &x) { // unsigned comparison of the single data word
+ return data < x.data;
+}
+
+// *********************************************************************
+inline void CKmer<1>::clear(void)
+{ // Resets the k-mer to all-zero bits.
+ data = 0ull;
+}
+
+// *********************************************************************
+inline uint64 CKmer<1>::remove_suffix(const uint32 n) const
+{ // Drops the n lowest bits; n must be below 64, the width of data.
+ return data >> n;
+}
+
+// *********************************************************************
+inline void CKmer<1>::set_n_1(const uint32 n)
+{ // Builds a mask with the n lowest bits set.
+ if (n == 64) // handled separately because 1ull << 64 is not a valid shift
+ data = ~(0ull);
+ else
+ data = (1ull << n) - 1;
+}
+
+// *********************************************************************
+inline void CKmer<1>::set_n_01(const uint32 n)
+{ // Sets every even-indexed bit below n (pattern ...0101) and clears everything else.
+ data = 0ull;
+
+ for (uint32 i = 0; i < n; ++i)
+ if (!(i & 1)) // even bit index
+ data += (1ull << i); // each bit is added once to a zeroed word, so '+' is equivalent to '|'
+}
+
+// *********************************************************************
+inline void CKmer<1>::store(uchar *&buffer, int32 n)
+{ // Writes bytes n-1 down to 0 of data (most significant first); buffer is a reference and ends up past the written bytes.
+ for (int32 i = n - 1; i >= 0; --i)
+ *buffer++ = get_byte(i);
+}
+
+// *********************************************************************
+inline void CKmer<1>::store(uchar *buffer, int32 p, int32 n)
+{ // Writes bytes n-1 down to 0 of data (most significant first) into buffer[p], buffer[p + 1], ...; the caller's pointer is not changed.
+ for (int32 i = n - 1; i >= 0; --i)
+ buffer[p++] = get_byte(i);
+}
+
+// *********************************************************************
+inline void CKmer<1>::load(uchar *&buffer, int32 n)
+{ // Inverse of store(uchar *&, int32): reads n bytes, most significant first, and advances buffer past them.
+ clear(); // set_byte() adds into data, so it must start from zero
+ for (int32 i = n - 1; i >= 0; --i)
+ set_byte(i, *buffer++);
+}
+
+
+// *********************************************************************
+char CKmer<1>::get_symbol(int p)
+{
+ // Symbol p occupies the two bits starting at bit 2 * p of the single data word.
+ const uint32 code = (data >> (2 * p)) & 0x03;
+
+ // 2-bit code -> nucleotide letter: 0 = A, 1 = C, 2 = G, 3 = T.
+ if (code == 0)
+ return 'A';
+ if (code == 1)
+ return 'C';
+ return code == 2 ? 'G' : 'T';
+}
+
+#endif
+
+// ***** EOF
+
+
diff --git a/kmc_tools/libs/alibcof64.lib b/kmc_tools/libs/alibcof64.lib
new file mode 100644
index 0000000..a0cc545
Binary files /dev/null and b/kmc_tools/libs/alibcof64.lib differ
diff --git a/kmc_tools/libs/alibelf64.a b/kmc_tools/libs/alibelf64.a
new file mode 100644
index 0000000..48beca6
Binary files /dev/null and b/kmc_tools/libs/alibelf64.a differ
diff --git a/kmc_tools/libs/asmlib.h b/kmc_tools/libs/asmlib.h
new file mode 100644
index 0000000..11ab4b2
--- /dev/null
+++ b/kmc_tools/libs/asmlib.h
@@ -0,0 +1,265 @@
+/*************************** asmlib.h ***************************************
+* Author: Agner Fog
+* Date created: 2003-12-12
+* Last modified: 2011-08-21
+* Project: asmlib.zip
+* Source URL: www.agner.org/optimize
+*
+* Description:
+* Header file for the asmlib function library.
+* This library is available in many versions for different platforms.
+* See asmlib-instructions.pdf for details.
+*
+* Copyright 2003 - 2011 by Agner Fog.
+* GNU General Public License http://www.gnu.org/licenses/gpl.html
+*****************************************************************************/
+
+
+#ifndef ASMLIB_H
+#define ASMLIB_H
+
+
+/***********************************************************************
+Define compiler-specific types and directives
+***********************************************************************/
+
+// Define type size_t
+#ifndef _SIZE_T_DEFINED
+#include "stddef.h"
+#endif
+
+// Define integer types with known size: int32_t, uint32_t, int64_t, uint64_t.
+// If this doesn't work then insert compiler-specific definitions here:
+#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1600)
+ // Compilers supporting C99 or C++0x have stdint.h defining these integer types
+ #include <stdint.h>
+ #define INT64_SUPPORTED // Remove this if the compiler doesn't support 64-bit integers
+#elif defined(_MSC_VER)
+ // Older Microsoft compilers have their own definition
+ typedef signed __int16 int16_t;
+ typedef unsigned __int16 uint16_t;
+ typedef signed __int32 int32_t;
+ typedef unsigned __int32 uint32_t;
+ typedef signed __int64 int64_t;
+ typedef unsigned __int64 uint64_t;
+ #define INT64_SUPPORTED // Remove this if the compiler doesn't support 64-bit integers
+#else
+ // This works with most compilers
+ typedef signed short int int16_t;
+ typedef unsigned short int uint16_t;
+ typedef signed int int32_t;
+ typedef unsigned int uint32_t;
+ typedef long long int64_t;
+ typedef unsigned long long uint64_t;
+ #define INT64_SUPPORTED // Remove this if the compiler doesn't support 64-bit integers
+#endif
+
+
+// Turn off name mangling
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***********************************************************************
+Function prototypes, memory and string functions
+***********************************************************************/
+void * A_memcpy (void * dest, const void * src, size_t count); // Copy count bytes from src to dest
+void * A_memmove(void * dest, const void * src, size_t count); // Same as memcpy, allows overlap between src and dest
+void * A_memset (void * dest, int c, size_t count); // Set count bytes in dest to (char)c
+size_t GetMemcpyCacheLimit(void); // Data blocks bigger than this will be copied uncached by memcpy and memmove
+void SetMemcpyCacheLimit(size_t); // Change limit in GetMemcpyCacheLimit
+size_t GetMemsetCacheLimit(void); // Data blocks bigger than this will be stored uncached by memset
+void SetMemsetCacheLimit(size_t); // Change limit in GetMemsetCacheLimit
+char * A_strcat (char * dest, const char * src); // Concatenate strings dest and src. Store result in dest
+char * A_strcpy (char * dest, const char * src); // Copy string src to dest
+size_t A_strlen (const char * str); // Get length of zero-terminated string
+int A_strcmp (const char * a, const char * b); // Compare strings. Case sensitive
+int A_stricmp (const char *string1, const char *string2); // Compare strings. Case insensitive for A-Z only
+char * A_strstr (char * haystack, const char * needle); // Search for substring in string
+void A_strtolower(char * string); // Convert string to lower case for A-Z only
+void A_strtoupper(char * string); // Convert string to upper case for a-z only
+size_t A_substring(char * dest, const char * source, size_t pos, size_t len); // Copy a substring for source into dest
+size_t A_strspn (const char * str, const char * set); // Find span of characters that belong to set
+size_t A_strcspn(const char * str, const char * set); // Find span of characters that don't belong to set
+size_t strCountInSet(const char * str, const char * set); // Count characters that belong to set
+size_t strcount_UTF8(const char * str); // Counts the number of characters in a UTF-8 encoded string
+
+/***********************************************************************
+Function prototypes, miscellaneous functions
+***********************************************************************/
+uint32_t A_popcount(uint32_t x); // Count 1-bits in 32-bit integer
+int RoundD (double x); // Round to nearest or even
+int RoundF (float x); // Round to nearest or even
+int InstructionSet(void); // Tell which instruction set is supported
+char * ProcessorName(void); // ASCIIZ text describing microprocessor
+void CpuType(int * vendor, int * family, int * model); // Get CPU vendor, family and model
+size_t DataCacheSize(int level); // Get size of data cache
+void A_DebugBreak(void); // Makes a debug breakpoint
+#ifdef INT64_SUPPORTED
+ uint64_t ReadTSC(void); // Read microprocessor internal clock (64 bits)
+#else
+ uint32_t ReadTSC(void); // Read microprocessor internal clock (only 32 bits supported by compiler)
+#endif
+void cpuid_ex (int abcd[4], int eax, int ecx); // call CPUID instruction
+static inline void cpuid_abcd (int abcd[4], int eax) {
+ cpuid_ex(abcd, eax, 0);}
+
+#ifdef __cplusplus
+} // end of extern "C"
+
+// Define overloaded versions if compiling as C++
+
+static inline int Round (double x) { // Overload name Round
+ return RoundD(x);}
+static inline int Round (float x) { // Overload name Round
+ return RoundF(x);}
+static inline const char * A_strstr(const char * haystack, const char * needle) {
+ return A_strstr((char*)haystack, needle);} // Overload A_strstr with const char * version
+
+#endif // __cplusplus
+
+
+/***********************************************************************
+Function prototypes, integer division functions
+***********************************************************************/
+
+// Turn off name mangling
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void setdivisori32(int buffer[2], int d); // Set divisor for repeated division
+int dividefixedi32(const int buffer[2], int x); // Fast division with previously set divisor
+void setdivisoru32(uint32_t buffer[2], uint32_t d); // Set divisor for repeated division
+uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x); // Fast division with previously set divisor
+
+// Test if emmintrin.h is included and __m128i defined
+#if defined(__GNUC__) && defined(_EMMINTRIN_H_INCLUDED) && !defined(__SSE2__)
+#error Please compile with -sse2 or higher
+#endif
+
+#if defined(_INCLUDED_EMM) || (defined(_EMMINTRIN_H_INCLUDED) && defined(__SSE2__))
+#define VECTORDIVISIONDEFINED
+
+// define vector division functions for 16 bit signed and unsigned integers
+void setdivisorV8i16(__m128i buf[2], int16_t d); // Set divisor for repeated division
+__m128i dividefixedV8i16(const __m128i buf[2], __m128i x); // Fast division with previously set divisor
+void setdivisorV8u16(__m128i buf[2], uint16_t d); // Set divisor for repeated division
+__m128i dividefixedV8u16(const __m128i buf[2], __m128i x); // Fast division with previously set divisor
+
+// define vector division functions for 32 bit signed and unsigned integers
+void setdivisorV4i32(__m128i buf[2], int32_t d); // Set divisor for repeated division
+__m128i dividefixedV4i32(const __m128i buf[2], __m128i x); // Fast division with previously set divisor
+void setdivisorV4u32(__m128i buf[2], uint32_t d); // Set divisor for repeated division
+__m128i dividefixedV4u32(const __m128i buf[2], __m128i x); // Fast division with previously set divisor
+#endif
+
+#ifdef __cplusplus
+} // end of extern "C"
+
+// Define classes and operator '/' for fast division with fixed divisor
+class div_i32;
+class div_u32;
+static inline int32_t operator / (int32_t x, div_i32 const &D);
+static inline uint32_t operator / (uint32_t x, div_u32 const & D);
+
+class div_i32 { // Divisor object for repeated signed 32-bit division: set the divisor once, then write x / D
+public:
+div_i32() {buffer[0] = buffer[1] = 0;} // Default constructor: zeroed buffer, no divisor set yet
+div_i32(int d) {setdivisor(d);} // Constructor with divisor
+void setdivisor(int d) {setdivisori32(buffer, d);} // Set divisor (fills buffer via setdivisori32)
+protected:
+ int buffer[2]; // Internal state written by setdivisori32 and read by dividefixedi32
+friend int32_t operator / (int32_t x, div_i32 const & D);
+};
+static inline int32_t operator / (int32_t x, div_i32 const &D){// Overloaded operator '/'
+ return dividefixedi32(D.buffer, x);}
+
+class div_u32 { // Divisor object for repeated unsigned 32-bit division: set the divisor once, then write x / D
+public:
+div_u32() {buffer[0] = buffer[1] = 0;} // Default constructor: zeroed buffer, no divisor set yet
+div_u32(uint32_t d) {setdivisor(d);} // Constructor with divisor
+void setdivisor(uint32_t d) {setdivisoru32(buffer, d);} // Set divisor (fills buffer via setdivisoru32)
+protected:
+ uint32_t buffer[2]; // Internal state written by setdivisoru32 and read by dividefixedu32
+friend uint32_t operator / (uint32_t x, div_u32 const & D);
+};
+static inline uint32_t operator / (uint32_t x, div_u32 const & D) { // Overloaded operator '/'
+ return dividefixedu32(D.buffer, x);}
+
+#ifdef VECTORDIVISIONDEFINED
+// Define classes and operator '/' for fast division of vectors with fixed divisor
+class div_v8i16; // vector of 8 signed integers of 16 bits
+class div_v8u16; // vector of 8 unsigned integers of 16 bits
+class div_v4i32; // vector of 4 signed integers of 32 bits
+class div_v4u32; // vector of 4 unsigned integers of 32 bits
+static inline __m128i operator / (__m128i x, div_v8i16 const & D);
+static inline __m128i operator / (__m128i x, div_v8u16 const & D);
+static inline __m128i operator / (__m128i x, div_v4i32 const & D);
+static inline __m128i operator / (__m128i x, div_v4u32 const & D);
+
+class div_v8i16 { // vector of 8 signed integers of 16 bits
+public:
+ div_v8i16() {buffer[0] = buffer[1] = _mm_set1_epi16(0);} // default constructor
+ div_v8i16(int16_t d) {setdivisor(d);} // constructor with divisor
+ void setdivisor(int16_t d) {setdivisorV8i16(buffer, d);} // set divisor
+protected:
+ __m128i buffer[2]; // Internal memory
+friend __m128i operator / (__m128i x, div_v8i16 const & D);
+};
+static inline __m128i operator / (__m128i x, div_v8i16 const &D){// Overloaded operator '/'
+ return dividefixedV8i16(D.buffer, x);}
+
+class div_v8u16 { // vector of 8 unsigned integers of 16 bits
+public:
+ div_v8u16() {buffer[0] = buffer[1] = _mm_set1_epi16(0);} // default constructor
+ div_v8u16(uint16_t d) {setdivisor(d);} // constructor with divisor
+ void setdivisor(uint16_t d) {setdivisorV8u16(buffer, d);} // set divisor
+protected:
+ __m128i buffer[2]; // Internal memory
+friend __m128i operator / (__m128i x, div_v8u16 const & D);
+};
+static inline __m128i operator / (__m128i x, div_v8u16 const &D){// Overloaded operator '/'
+ return dividefixedV8u16(D.buffer, x);}
+
+class div_v4i32 { // vector of 4 signed integers of 32 bits
+public:
+ div_v4i32() {buffer[0] = buffer[1] = _mm_set1_epi32(0);} // default constructor
+ div_v4i32(int32_t d) {setdivisor(d);} // constructor with divisor
+ void setdivisor(int32_t d) {setdivisorV4i32(buffer, d);} // set divisor
+protected:
+ __m128i buffer[2]; // Internal memory
+friend __m128i operator / (__m128i x, div_v4i32 const & D);
+};
+static inline __m128i operator / (__m128i x, div_v4i32 const &D){// Overloaded operator '/'
+ return dividefixedV4i32(D.buffer, x);}
+
+class div_v4u32 { // vector of 4 unsigned integers of 32 bits
+public:
+ div_v4u32() {buffer[0] = buffer[1] = _mm_set1_epi32(0);} // default constructor
+ div_v4u32(uint32_t d) {setdivisor(d);} // constructor with divisor
+ void setdivisor(uint32_t d) {setdivisorV4u32(buffer, d);} // set divisor
+protected:
+ __m128i buffer[2]; // Internal memory
+friend __m128i operator / (__m128i x, div_v4u32 const & D);
+};
+static inline __m128i operator / (__m128i x, div_v4u32 const &D){// Overloaded operator '/'
+ return dividefixedV4u32(D.buffer, x);}
+
+// Support for vector classes defined in Intel's dvec.h
+#ifdef _DVEC_H_INCLUDED
+static inline Is32vec4 operator / (Is32vec4 const &x, div_v4i32 const &D){
+ return (__m128i)x / D;}
+static inline Iu32vec4 operator / (Iu32vec4 const &x, div_v4u32 const &D){
+ return (__m128i)x / D;}
+static inline Is16vec8 operator / (Is16vec8 const &x, div_v8i16 const &D){
+ return (__m128i)x / D;}
+static inline Iu16vec8 operator / (Iu16vec8 const &x, div_v8u16 const &D){
+ return (__m128i)x / D;}
+#endif // _DVEC_H_INCLUDED
+
+#endif // VECTORDIVISIONDEFINED
+
+#endif // __cplusplus
+
+#endif // ASMLIB_H
diff --git a/kmc_tools/libs/bzlib.h b/kmc_tools/libs/bzlib.h
new file mode 100644
index 0000000..8277123
--- /dev/null
+++ b/kmc_tools/libs/bzlib.h
@@ -0,0 +1,282 @@
+
+/*-------------------------------------------------------------*/
+/*--- Public header file for the library. ---*/
+/*--- bzlib.h ---*/
+/*-------------------------------------------------------------*/
+
+/* ------------------------------------------------------------------
+ This file is part of bzip2/libbzip2, a program and library for
+ lossless, block-sorting data compression.
+
+ bzip2/libbzip2 version 1.0.6 of 6 September 2010
+ Copyright (C) 1996-2010 Julian Seward <jseward at bzip.org>
+
+ Please read the WARNING, DISCLAIMER and PATENTS sections in the
+ README file.
+
+ This program is released under the terms of the license contained
+ in the file LICENSE.
+ ------------------------------------------------------------------ */
+
+
+#ifndef _BZLIB_H
+#define _BZLIB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BZ_RUN 0
+#define BZ_FLUSH 1
+#define BZ_FINISH 2
+
+#define BZ_OK 0
+#define BZ_RUN_OK 1
+#define BZ_FLUSH_OK 2
+#define BZ_FINISH_OK 3
+#define BZ_STREAM_END 4
+#define BZ_SEQUENCE_ERROR (-1)
+#define BZ_PARAM_ERROR (-2)
+#define BZ_MEM_ERROR (-3)
+#define BZ_DATA_ERROR (-4)
+#define BZ_DATA_ERROR_MAGIC (-5)
+#define BZ_IO_ERROR (-6)
+#define BZ_UNEXPECTED_EOF (-7)
+#define BZ_OUTBUFF_FULL (-8)
+#define BZ_CONFIG_ERROR (-9)
+
+typedef
+ struct {
+ char *next_in;
+ unsigned int avail_in;
+ unsigned int total_in_lo32;
+ unsigned int total_in_hi32;
+
+ char *next_out;
+ unsigned int avail_out;
+ unsigned int total_out_lo32;
+ unsigned int total_out_hi32;
+
+ void *state;
+
+ void *(*bzalloc)(void *,int,int);
+ void (*bzfree)(void *,void *);
+ void *opaque;
+ }
+ bz_stream;
+
+
+#ifndef BZ_IMPORT
+#define BZ_EXPORT
+#endif
+
+#ifndef BZ_NO_STDIO
+/* Need a definition for FILE */
+#include <stdio.h>
+#endif
+
+#ifdef _WIN32
+# include <windows.h>
+# ifdef small
+ /* windows.h defines small as char */
+# undef small
+# endif
+# ifdef BZ_EXPORT
+# define BZ_API(func) WINAPI func
+# define BZ_EXTERN extern
+# else
+ /* import windows dll dynamically */
+# define BZ_API(func) (WINAPI * func)
+# define BZ_EXTERN
+# endif
+#else
+# define BZ_API(func) func
+# define BZ_EXTERN extern
+#endif
+
+
+/*-- Core (low-level) library functions --*/
+
+BZ_EXTERN int BZ_API(BZ2_bzCompressInit) (
+ bz_stream* strm,
+ int blockSize100k,
+ int verbosity,
+ int workFactor
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzCompress) (
+ bz_stream* strm,
+ int action
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzCompressEnd) (
+ bz_stream* strm
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzDecompressInit) (
+ bz_stream *strm,
+ int verbosity,
+ int small
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzDecompress) (
+ bz_stream* strm
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzDecompressEnd) (
+ bz_stream *strm
+ );
+
+
+
+/*-- High(er) level library functions --*/
+
+#ifndef BZ_NO_STDIO
+#define BZ_MAX_UNUSED 5000
+
+typedef void BZFILE;
+
+BZ_EXTERN BZFILE* BZ_API(BZ2_bzReadOpen) (
+ int* bzerror,
+ FILE* f,
+ int verbosity,
+ int small,
+ void* unused,
+ int nUnused
+ );
+
+BZ_EXTERN void BZ_API(BZ2_bzReadClose) (
+ int* bzerror,
+ BZFILE* b
+ );
+
+BZ_EXTERN void BZ_API(BZ2_bzReadGetUnused) (
+ int* bzerror,
+ BZFILE* b,
+ void** unused,
+ int* nUnused
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzRead) (
+ int* bzerror,
+ BZFILE* b,
+ void* buf,
+ int len
+ );
+
+BZ_EXTERN BZFILE* BZ_API(BZ2_bzWriteOpen) (
+ int* bzerror,
+ FILE* f,
+ int blockSize100k,
+ int verbosity,
+ int workFactor
+ );
+
+BZ_EXTERN void BZ_API(BZ2_bzWrite) (
+ int* bzerror,
+ BZFILE* b,
+ void* buf,
+ int len
+ );
+
+BZ_EXTERN void BZ_API(BZ2_bzWriteClose) (
+ int* bzerror,
+ BZFILE* b,
+ int abandon,
+ unsigned int* nbytes_in,
+ unsigned int* nbytes_out
+ );
+
+BZ_EXTERN void BZ_API(BZ2_bzWriteClose64) (
+ int* bzerror,
+ BZFILE* b,
+ int abandon,
+ unsigned int* nbytes_in_lo32,
+ unsigned int* nbytes_in_hi32,
+ unsigned int* nbytes_out_lo32,
+ unsigned int* nbytes_out_hi32
+ );
+#endif
+
+
+/*-- Utility functions --*/
+
+BZ_EXTERN int BZ_API(BZ2_bzBuffToBuffCompress) (
+ char* dest,
+ unsigned int* destLen,
+ char* source,
+ unsigned int sourceLen,
+ int blockSize100k,
+ int verbosity,
+ int workFactor
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzBuffToBuffDecompress) (
+ char* dest,
+ unsigned int* destLen,
+ char* source,
+ unsigned int sourceLen,
+ int small,
+ int verbosity
+ );
+
+
+/*--
+ Code contributed by Yoshioka Tsuneo (tsuneo at rr.iij4u.or.jp)
+ to support better zlib compatibility.
+ This code is not _officially_ part of libbzip2 (yet);
+ I haven't tested it, documented it, or considered the
+ threading-safeness of it.
+ If this code breaks, please contact both Yoshioka and me.
+--*/
+
+BZ_EXTERN const char * BZ_API(BZ2_bzlibVersion) (
+ void
+ );
+
+#ifndef BZ_NO_STDIO
+BZ_EXTERN BZFILE * BZ_API(BZ2_bzopen) (
+ const char *path,
+ const char *mode
+ );
+
+BZ_EXTERN BZFILE * BZ_API(BZ2_bzdopen) (
+ int fd,
+ const char *mode
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzread) (
+ BZFILE* b,
+ void* buf,
+ int len
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzwrite) (
+ BZFILE* b,
+ void* buf,
+ int len
+ );
+
+BZ_EXTERN int BZ_API(BZ2_bzflush) (
+ BZFILE* b
+ );
+
+BZ_EXTERN void BZ_API(BZ2_bzclose) (
+ BZFILE* b
+ );
+
+BZ_EXTERN const char * BZ_API(BZ2_bzerror) (
+ BZFILE *b,
+ int *errnum
+ );
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+/*-------------------------------------------------------------*/
+/*--- end bzlib.h ---*/
+/*-------------------------------------------------------------*/
diff --git a/kmc_tools/libs/bzlib_private.h b/kmc_tools/libs/bzlib_private.h
new file mode 100644
index 0000000..5d0217f
--- /dev/null
+++ b/kmc_tools/libs/bzlib_private.h
@@ -0,0 +1,509 @@
+
+/*-------------------------------------------------------------*/
+/*--- Private header file for the library. ---*/
+/*--- bzlib_private.h ---*/
+/*-------------------------------------------------------------*/
+
+/* ------------------------------------------------------------------
+ This file is part of bzip2/libbzip2, a program and library for
+ lossless, block-sorting data compression.
+
+ bzip2/libbzip2 version 1.0.6 of 6 September 2010
+ Copyright (C) 1996-2010 Julian Seward <jseward at bzip.org>
+
+ Please read the WARNING, DISCLAIMER and PATENTS sections in the
+ README file.
+
+ This program is released under the terms of the license contained
+ in the file LICENSE.
+ ------------------------------------------------------------------ */
+
+
+#ifndef _BZLIB_PRIVATE_H
+#define _BZLIB_PRIVATE_H
+
+#include <stdlib.h>
+
+#ifndef BZ_NO_STDIO
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#endif
+
+#include "bzlib.h"
+
+
+
+/*-- General stuff. --*/
+
+#define BZ_VERSION "1.0.6, 6-Sept-2010"
+
+typedef char Char;
+typedef unsigned char Bool;
+typedef unsigned char UChar;
+typedef int Int32;
+typedef unsigned int UInt32;
+typedef short Int16;
+typedef unsigned short UInt16;
+
+#define True ((Bool)1)
+#define False ((Bool)0)
+
+#ifndef __GNUC__
+#define __inline__ /* */
+#endif
+
+#ifndef BZ_NO_STDIO
+
+extern void BZ2_bz__AssertH__fail ( int errcode );
+#define AssertH(cond,errcode) \
+ { if (!(cond)) BZ2_bz__AssertH__fail ( errcode ); }
+
+#if BZ_DEBUG
+#define AssertD(cond,msg) \
+ { if (!(cond)) { \
+ fprintf ( stderr, \
+ "\n\nlibbzip2(debug build): internal error\n\t%s\n", msg );\
+ exit(1); \
+ }}
+#else
+#define AssertD(cond,msg) /* */
+#endif
+
+#define VPrintf0(zf) \
+ fprintf(stderr,zf)
+#define VPrintf1(zf,za1) \
+ fprintf(stderr,zf,za1)
+#define VPrintf2(zf,za1,za2) \
+ fprintf(stderr,zf,za1,za2)
+#define VPrintf3(zf,za1,za2,za3) \
+ fprintf(stderr,zf,za1,za2,za3)
+#define VPrintf4(zf,za1,za2,za3,za4) \
+ fprintf(stderr,zf,za1,za2,za3,za4)
+#define VPrintf5(zf,za1,za2,za3,za4,za5) \
+ fprintf(stderr,zf,za1,za2,za3,za4,za5)
+
+#else
+
+extern void bz_internal_error ( int errcode );
+#define AssertH(cond,errcode) \
+ { if (!(cond)) bz_internal_error ( errcode ); }
+#define AssertD(cond,msg) do { } while (0)
+#define VPrintf0(zf) do { } while (0)
+#define VPrintf1(zf,za1) do { } while (0)
+#define VPrintf2(zf,za1,za2) do { } while (0)
+#define VPrintf3(zf,za1,za2,za3) do { } while (0)
+#define VPrintf4(zf,za1,za2,za3,za4) do { } while (0)
+#define VPrintf5(zf,za1,za2,za3,za4,za5) do { } while (0)
+
+#endif
+
+
+#define BZALLOC(nnn) (strm->bzalloc)(strm->opaque,(nnn),1)
+#define BZFREE(ppp) (strm->bzfree)(strm->opaque,(ppp))
+
+
+/*-- Header bytes. --*/
+
+#define BZ_HDR_B 0x42 /* 'B' */
+#define BZ_HDR_Z 0x5a /* 'Z' */
+#define BZ_HDR_h 0x68 /* 'h' */
+#define BZ_HDR_0 0x30 /* '0' */
+
+/*-- Constants for the back end. --*/
+
+#define BZ_MAX_ALPHA_SIZE 258
+#define BZ_MAX_CODE_LEN 23
+
+#define BZ_RUNA 0
+#define BZ_RUNB 1
+
+#define BZ_N_GROUPS 6
+#define BZ_G_SIZE 50
+#define BZ_N_ITERS 4
+
+#define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE))
+
+
+
+/*-- Stuff for randomising repetitive blocks. --*/
+
+extern Int32 BZ2_rNums[512];
+
+#define BZ_RAND_DECLS \
+ Int32 rNToGo; \
+ Int32 rTPos \
+
+#define BZ_RAND_INIT_MASK \
+ s->rNToGo = 0; \
+ s->rTPos = 0 \
+
+#define BZ_RAND_MASK ((s->rNToGo == 1) ? 1 : 0)
+
+#define BZ_RAND_UPD_MASK \
+ if (s->rNToGo == 0) { \
+ s->rNToGo = BZ2_rNums[s->rTPos]; \
+ s->rTPos++; \
+ if (s->rTPos == 512) s->rTPos = 0; \
+ } \
+ s->rNToGo--;
+
+
+
+/*-- Stuff for doing CRCs. --*/
+
+extern UInt32 BZ2_crc32Table[256];
+
+#define BZ_INITIALISE_CRC(crcVar) \
+{ \
+ crcVar = 0xffffffffL; \
+}
+
+#define BZ_FINALISE_CRC(crcVar) \
+{ \
+ crcVar = ~(crcVar); \
+}
+
+#define BZ_UPDATE_CRC(crcVar,cha) /* fold one byte (cha) into crcVar via BZ2_crc32Table; args parenthesized so compound expressions are safe */ \
+{ \
+ crcVar = ((crcVar) << 8) ^ \
+ BZ2_crc32Table[((crcVar) >> 24) ^ \
+ ((UChar)(cha))]; \
+}
+
+
+
+/*-- States and modes for compression. --*/
+
+#define BZ_M_IDLE 1
+#define BZ_M_RUNNING 2
+#define BZ_M_FLUSHING 3
+#define BZ_M_FINISHING 4
+
+#define BZ_S_OUTPUT 1
+#define BZ_S_INPUT 2
+
+#define BZ_N_RADIX 2
+#define BZ_N_QSORT 12
+#define BZ_N_SHELL 18
+#define BZ_N_OVERSHOOT (BZ_N_RADIX + BZ_N_QSORT + BZ_N_SHELL + 2)
+
+
+
+
+/*-- Structure holding all the compression-side stuff. --*/
+
+typedef
+ struct {
+ /* pointer back to the struct bz_stream */
+ bz_stream* strm;
+
+ /* mode this stream is in, and whether inputting */
+ /* or outputting data */
+ Int32 mode;
+ Int32 state;
+
+ /* remembers avail_in when flush/finish requested */
+ UInt32 avail_in_expect;
+
+ /* for doing the block sorting */
+ UInt32* arr1;
+ UInt32* arr2;
+ UInt32* ftab;
+ Int32 origPtr;
+
+ /* aliases for arr1 and arr2 */
+ UInt32* ptr;
+ UChar* block;
+ UInt16* mtfv;
+ UChar* zbits;
+
+ /* for deciding when to use the fallback sorting algorithm */
+ Int32 workFactor;
+
+ /* run-length-encoding of the input */
+ UInt32 state_in_ch;
+ Int32 state_in_len;
+ BZ_RAND_DECLS;
+
+ /* input and output limits and current posns */
+ Int32 nblock;
+ Int32 nblockMAX;
+ Int32 numZ;
+ Int32 state_out_pos;
+
+ /* map of bytes used in block */
+ Int32 nInUse;
+ Bool inUse[256];
+ UChar unseqToSeq[256];
+
+ /* the buffer for bit stream creation */
+ UInt32 bsBuff;
+ Int32 bsLive;
+
+ /* block and combined CRCs */
+ UInt32 blockCRC;
+ UInt32 combinedCRC;
+
+ /* misc administratium */
+ Int32 verbosity;
+ Int32 blockNo;
+ Int32 blockSize100k;
+
+ /* stuff for coding the MTF values */
+ Int32 nMTF;
+ Int32 mtfFreq [BZ_MAX_ALPHA_SIZE];
+ UChar selector [BZ_MAX_SELECTORS];
+ UChar selectorMtf[BZ_MAX_SELECTORS];
+
+ UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+ Int32 code [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+ Int32 rfreq [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+ /* second dimension: only 3 needed; 4 makes index calculations faster */
+ UInt32 len_pack[BZ_MAX_ALPHA_SIZE][4];
+
+ }
+ EState;
+
+
+
+/*-- externs for compression. --*/
+
+extern void
+BZ2_blockSort ( EState* );
+
+extern void
+BZ2_compressBlock ( EState*, Bool );
+
+extern void
+BZ2_bsInitWrite ( EState* );
+
+extern void
+BZ2_hbAssignCodes ( Int32*, UChar*, Int32, Int32, Int32 );
+
+extern void
+BZ2_hbMakeCodeLengths ( UChar*, Int32*, Int32, Int32 );
+
+
+
+/*-- states for decompression. --*/
+
+#define BZ_X_IDLE 1
+#define BZ_X_OUTPUT 2
+
+#define BZ_X_MAGIC_1 10
+#define BZ_X_MAGIC_2 11
+#define BZ_X_MAGIC_3 12
+#define BZ_X_MAGIC_4 13
+#define BZ_X_BLKHDR_1 14
+#define BZ_X_BLKHDR_2 15
+#define BZ_X_BLKHDR_3 16
+#define BZ_X_BLKHDR_4 17
+#define BZ_X_BLKHDR_5 18
+#define BZ_X_BLKHDR_6 19
+#define BZ_X_BCRC_1 20
+#define BZ_X_BCRC_2 21
+#define BZ_X_BCRC_3 22
+#define BZ_X_BCRC_4 23
+#define BZ_X_RANDBIT 24
+#define BZ_X_ORIGPTR_1 25
+#define BZ_X_ORIGPTR_2 26
+#define BZ_X_ORIGPTR_3 27
+#define BZ_X_MAPPING_1 28
+#define BZ_X_MAPPING_2 29
+#define BZ_X_SELECTOR_1 30
+#define BZ_X_SELECTOR_2 31
+#define BZ_X_SELECTOR_3 32
+#define BZ_X_CODING_1 33
+#define BZ_X_CODING_2 34
+#define BZ_X_CODING_3 35
+#define BZ_X_MTF_1 36
+#define BZ_X_MTF_2 37
+#define BZ_X_MTF_3 38
+#define BZ_X_MTF_4 39
+#define BZ_X_MTF_5 40
+#define BZ_X_MTF_6 41
+#define BZ_X_ENDHDR_2 42
+#define BZ_X_ENDHDR_3 43
+#define BZ_X_ENDHDR_4 44
+#define BZ_X_ENDHDR_5 45
+#define BZ_X_ENDHDR_6 46
+#define BZ_X_CCRC_1 47
+#define BZ_X_CCRC_2 48
+#define BZ_X_CCRC_3 49
+#define BZ_X_CCRC_4 50
+
+
+
+/*-- Constants for the fast MTF decoder. --*/
+
+#define MTFA_SIZE 4096
+#define MTFL_SIZE 16
+
+
+
+/*-- Structure holding all the decompression-side stuff. --*/
+
+typedef
+ struct {
+ /* pointer back to the struct bz_stream */
+ bz_stream* strm;
+
+ /* state indicator for this stream */
+ Int32 state;
+
+ /* for doing the final run-length decoding */
+ UChar state_out_ch;
+ Int32 state_out_len;
+ Bool blockRandomised;
+ BZ_RAND_DECLS;
+
+ /* the buffer for bit stream reading */
+ UInt32 bsBuff;
+ Int32 bsLive;
+
+ /* misc administratium */
+ Int32 blockSize100k;
+ Bool smallDecompress;
+ Int32 currBlockNo;
+ Int32 verbosity;
+
+ /* for undoing the Burrows-Wheeler transform */
+ Int32 origPtr;
+ UInt32 tPos;
+ Int32 k0;
+ Int32 unzftab[256];
+ Int32 nblock_used;
+ Int32 cftab[257];
+ Int32 cftabCopy[257];
+
+ /* for undoing the Burrows-Wheeler transform (FAST) */
+ UInt32 *tt;
+
+ /* for undoing the Burrows-Wheeler transform (SMALL) */
+ UInt16 *ll16;
+ UChar *ll4;
+
+ /* stored and calculated CRCs */
+ UInt32 storedBlockCRC;
+ UInt32 storedCombinedCRC;
+ UInt32 calculatedBlockCRC;
+ UInt32 calculatedCombinedCRC;
+
+ /* map of bytes used in block */
+ Int32 nInUse;
+ Bool inUse[256];
+ Bool inUse16[16];
+ UChar seqToUnseq[256];
+
+ /* for decoding the MTF values */
+ UChar mtfa [MTFA_SIZE];
+ Int32 mtfbase[256 / MTFL_SIZE];
+ UChar selector [BZ_MAX_SELECTORS];
+ UChar selectorMtf[BZ_MAX_SELECTORS];
+ UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+
+ Int32 limit [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+ Int32 base [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+ Int32 perm [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+ Int32 minLens[BZ_N_GROUPS];
+
+ /* save area for scalars in the main decompress code */
+ Int32 save_i;
+ Int32 save_j;
+ Int32 save_t;
+ Int32 save_alphaSize;
+ Int32 save_nGroups;
+ Int32 save_nSelectors;
+ Int32 save_EOB;
+ Int32 save_groupNo;
+ Int32 save_groupPos;
+ Int32 save_nextSym;
+ Int32 save_nblockMAX;
+ Int32 save_nblock;
+ Int32 save_es;
+ Int32 save_N;
+ Int32 save_curr;
+ Int32 save_zt;
+ Int32 save_zn;
+ Int32 save_zvec;
+ Int32 save_zj;
+ Int32 save_gSel;
+ Int32 save_gMinlen;
+ Int32* save_gLimit;
+ Int32* save_gBase;
+ Int32* save_gPerm;
+
+ }
+ DState;
+
+
+
+/*-- Macros for decompression. --*/
+
+#define BZ_GET_FAST(cccc) \
+ /* s->tPos is unsigned, hence test < 0 is pointless. */ \
+ if (s->tPos >= (UInt32)100000 * (UInt32)s->blockSize100k) return True; \
+ s->tPos = s->tt[s->tPos]; \
+ cccc = (UChar)(s->tPos & 0xff); \
+ s->tPos >>= 8;
+
+#define BZ_GET_FAST_C(cccc) \
+ /* c_tPos is unsigned, hence test < 0 is pointless. */ \
+ if (c_tPos >= (UInt32)100000 * (UInt32)ro_blockSize100k) return True; \
+ c_tPos = c_tt[c_tPos]; \
+ cccc = (UChar)(c_tPos & 0xff); \
+ c_tPos >>= 8;
+
+#define SET_LL4(i,n) \
+ { if (((i) & 0x1) == 0) \
+ s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0xf0) | (n); else \
+ s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0x0f) | ((n) << 4); \
+ }
+
+#define GET_LL4(i) \
+ ((((UInt32)(s->ll4[(i) >> 1])) >> (((i) << 2) & 0x4)) & 0xF)
+
+#define SET_LL(i,n) /* low 16 bits of n go to ll16[i]; n >> 16 goes to the ll4 nibble for i */ \
+ { s->ll16[(i)] = (UInt16)((n) & 0x0000ffff); \
+ SET_LL4(i, (n) >> 16); \
+ }
+
+#define GET_LL(i) \
+ (((UInt32)s->ll16[i]) | (GET_LL4(i) << 16))
+
+#define BZ_GET_SMALL(cccc) \
+ /* s->tPos is unsigned, hence test < 0 is pointless. */ \
+ if (s->tPos >= (UInt32)100000 * (UInt32)s->blockSize100k) return True; \
+ cccc = BZ2_indexIntoF ( s->tPos, s->cftab ); \
+ s->tPos = GET_LL(s->tPos);
+
+
+/*-- externs for decompression. --*/
+
+extern Int32
+BZ2_indexIntoF ( Int32, Int32* );
+
+extern Int32
+BZ2_decompress ( DState* );
+
+extern void
+BZ2_hbCreateDecodeTables ( Int32*, Int32*, Int32*, UChar*,
+ Int32, Int32, Int32 );
+
+
+#endif
+
+
+/*-- BZ_NO_STDIO seems to make NULL disappear on some platforms. --*/
+
+#ifdef BZ_NO_STDIO
+#ifndef NULL
+#define NULL 0
+#endif
+#endif
+
+
+/*-------------------------------------------------------------*/
+/*--- end bzlib_private.h ---*/
+/*-------------------------------------------------------------*/
diff --git a/kmc_tools/libs/libamac64.a b/kmc_tools/libs/libamac64.a
new file mode 100644
index 0000000..4d0fb0b
Binary files /dev/null and b/kmc_tools/libs/libamac64.a differ
diff --git a/kmc_tools/libs/libbz2.1.0.5.dylib b/kmc_tools/libs/libbz2.1.0.5.dylib
new file mode 100644
index 0000000..ffb0cee
Binary files /dev/null and b/kmc_tools/libs/libbz2.1.0.5.dylib differ
diff --git a/kmc_tools/libs/libbz2.a b/kmc_tools/libs/libbz2.a
new file mode 100644
index 0000000..2c4f8b2
Binary files /dev/null and b/kmc_tools/libs/libbz2.a differ
diff --git a/kmc_tools/libs/libbzip2.lib b/kmc_tools/libs/libbzip2.lib
new file mode 100644
index 0000000..66d3e4f
Binary files /dev/null and b/kmc_tools/libs/libbzip2.lib differ
diff --git a/kmc_tools/libs/libz.1.2.5.dylib b/kmc_tools/libs/libz.1.2.5.dylib
new file mode 100644
index 0000000..0293aef
Binary files /dev/null and b/kmc_tools/libs/libz.1.2.5.dylib differ
diff --git a/kmc_tools/libs/libz.a b/kmc_tools/libs/libz.a
new file mode 100644
index 0000000..b57857b
Binary files /dev/null and b/kmc_tools/libs/libz.a differ
diff --git a/kmc_tools/libs/zconf.h b/kmc_tools/libs/zconf.h
new file mode 100644
index 0000000..8a46a58
--- /dev/null
+++ b/kmc_tools/libs/zconf.h
@@ -0,0 +1,506 @@
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-2012 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#ifndef ZCONF_H
+#define ZCONF_H
+
+/*
+ * If you *really* need a unique prefix for all types and library functions,
+ * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
+ * Even better than compiling with -DZ_PREFIX would be to use configure to set
+ * this permanently in zconf.h using "./configure --zprefix".
+ */
+#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */
+# define Z_PREFIX_SET
+
+/* all linked symbols */
+# define _dist_code z__dist_code
+# define _length_code z__length_code
+# define _tr_align z__tr_align
+# define _tr_flush_block z__tr_flush_block
+# define _tr_init z__tr_init
+# define _tr_stored_block z__tr_stored_block
+# define _tr_tally z__tr_tally
+# define adler32 z_adler32
+# define adler32_combine z_adler32_combine
+# define adler32_combine64 z_adler32_combine64
+# ifndef Z_SOLO
+# define compress z_compress
+# define compress2 z_compress2
+# define compressBound z_compressBound
+# endif
+# define crc32 z_crc32
+# define crc32_combine z_crc32_combine
+# define crc32_combine64 z_crc32_combine64
+# define deflate z_deflate
+# define deflateBound z_deflateBound
+# define deflateCopy z_deflateCopy
+# define deflateEnd z_deflateEnd
+# define deflateInit2_ z_deflateInit2_
+# define deflateInit_ z_deflateInit_
+# define deflateParams z_deflateParams
+# define deflatePending z_deflatePending
+# define deflatePrime z_deflatePrime
+# define deflateReset z_deflateReset
+# define deflateResetKeep z_deflateResetKeep
+# define deflateSetDictionary z_deflateSetDictionary
+# define deflateSetHeader z_deflateSetHeader
+# define deflateTune z_deflateTune
+# define deflate_copyright z_deflate_copyright
+# define get_crc_table z_get_crc_table
+# ifndef Z_SOLO
+# define gz_error z_gz_error
+# define gz_intmax z_gz_intmax
+# define gz_strwinerror z_gz_strwinerror
+# define gzbuffer z_gzbuffer
+# define gzclearerr z_gzclearerr
+# define gzclose z_gzclose
+# define gzclose_r z_gzclose_r
+# define gzclose_w z_gzclose_w
+# define gzdirect z_gzdirect
+# define gzdopen z_gzdopen
+# define gzeof z_gzeof
+# define gzerror z_gzerror
+# define gzflush z_gzflush
+# define gzgetc z_gzgetc
+# define gzgetc_ z_gzgetc_
+# define gzgets z_gzgets
+# define gzoffset z_gzoffset
+# define gzoffset64 z_gzoffset64
+# define gzopen z_gzopen
+# define gzopen64 z_gzopen64
+# ifdef _WIN32
+# define gzopen_w z_gzopen_w
+# endif
+# define gzprintf z_gzprintf
+# define gzputc z_gzputc
+# define gzputs z_gzputs
+# define gzread z_gzread
+# define gzrewind z_gzrewind
+# define gzseek z_gzseek
+# define gzseek64 z_gzseek64
+# define gzsetparams z_gzsetparams
+# define gztell z_gztell
+# define gztell64 z_gztell64
+# define gzungetc z_gzungetc
+# define gzwrite z_gzwrite
+# endif
+# define inflate z_inflate
+# define inflateBack z_inflateBack
+# define inflateBackEnd z_inflateBackEnd
+# define inflateBackInit_ z_inflateBackInit_
+# define inflateCopy z_inflateCopy
+# define inflateEnd z_inflateEnd
+# define inflateGetHeader z_inflateGetHeader
+# define inflateInit2_ z_inflateInit2_
+# define inflateInit_ z_inflateInit_
+# define inflateMark z_inflateMark
+# define inflatePrime z_inflatePrime
+# define inflateReset z_inflateReset
+# define inflateReset2 z_inflateReset2
+# define inflateSetDictionary z_inflateSetDictionary
+# define inflateSync z_inflateSync
+# define inflateSyncPoint z_inflateSyncPoint
+# define inflateUndermine z_inflateUndermine
+# define inflateResetKeep z_inflateResetKeep
+# define inflate_copyright z_inflate_copyright
+# define inflate_fast z_inflate_fast
+# define inflate_table z_inflate_table
+# ifndef Z_SOLO
+# define uncompress z_uncompress
+# endif
+# define zError z_zError
+# ifndef Z_SOLO
+# define zcalloc z_zcalloc
+# define zcfree z_zcfree
+# endif
+# define zlibCompileFlags z_zlibCompileFlags
+# define zlibVersion z_zlibVersion
+
+/* all zlib typedefs in zlib.h and zconf.h */
+# define Byte z_Byte
+# define Bytef z_Bytef
+# define alloc_func z_alloc_func
+# define charf z_charf
+# define free_func z_free_func
+# ifndef Z_SOLO
+# define gzFile z_gzFile
+# endif
+# define gz_header z_gz_header
+# define gz_headerp z_gz_headerp
+# define in_func z_in_func
+# define intf z_intf
+# define out_func z_out_func
+# define uInt z_uInt
+# define uIntf z_uIntf
+# define uLong z_uLong
+# define uLongf z_uLongf
+# define voidp z_voidp
+# define voidpc z_voidpc
+# define voidpf z_voidpf
+
+/* all zlib structs in zlib.h and zconf.h */
+# define gz_header_s z_gz_header_s
+# define internal_state z_internal_state
+
+#endif
+
+#if defined(__MSDOS__) && !defined(MSDOS)
+# define MSDOS
+#endif
+#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2)
+# define OS2
+#endif
+#if defined(_WINDOWS) && !defined(WINDOWS)
+# define WINDOWS
+#endif
+#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__)
+# ifndef WIN32
+# define WIN32
+# endif
+#endif
+#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32)
+# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__)
+# ifndef SYS16BIT
+# define SYS16BIT
+# endif
+# endif
+#endif
+
+/*
+ * Compile with -DMAXSEG_64K if the alloc function cannot allocate more
+ * than 64k bytes at a time (needed on systems with 16-bit int).
+ */
+#ifdef SYS16BIT
+# define MAXSEG_64K
+#endif
+#ifdef MSDOS
+# define UNALIGNED_OK
+#endif
+
+#ifdef __STDC_VERSION__
+# ifndef STDC
+# define STDC
+# endif
+# if __STDC_VERSION__ >= 199901L
+# ifndef STDC99
+# define STDC99
+# endif
+# endif
+#endif
+#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus))
+# define STDC
+#endif
+#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__))
+# define STDC
+#endif
+#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32))
+# define STDC
+#endif
+#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__))
+# define STDC
+#endif
+
+#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */
+# define STDC
+#endif
+
+#ifndef STDC
+# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */
+# define const /* note: need a more gentle solution here */
+# endif
+#endif
+
+#if defined(ZLIB_CONST) && !defined(z_const)
+# define z_const const
+#else
+# define z_const
+#endif
+
+/* Some Mac compilers merge all .h files incorrectly: */
+#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__)
+# define NO_DUMMY_DECL
+#endif
+
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+# ifdef MAXSEG_64K
+# define MAX_MEM_LEVEL 8
+# else
+# define MAX_MEM_LEVEL 9
+# endif
+#endif
+
+/* Maximum value for windowBits in deflateInit2 and inflateInit2.
+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
+ * created by gzip. (Files created by minigzip can still be extracted by
+ * gzip.)
+ */
+#ifndef MAX_WBITS
+# define MAX_WBITS 15 /* 32K LZ77 window */
+#endif
+
+/* The memory requirements for deflate are (in bytes):
+ (1 << (windowBits+2)) + (1 << (memLevel+9))
+ that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+ make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+ The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus a few kilobytes
+ for small objects.
+*/
+
+ /* Type declarations */
+
+#ifndef OF /* function prototypes */
+# ifdef STDC
+# define OF(args) args
+# else
+# define OF(args) ()
+# endif
+#endif
+
+#ifndef Z_ARG /* function prototypes for stdarg */
+# if defined(STDC) || defined(Z_HAVE_STDARG_H)
+# define Z_ARG(args) args
+# else
+# define Z_ARG(args) ()
+# endif
+#endif
+
+/* The following definitions for FAR are needed only for MSDOS mixed
+ * model programming (small or medium model with some far allocations).
+ * This was tested only with MSC; for other MSDOS compilers you may have
+ * to define NO_MEMCPY in zutil.h. If you don't need the mixed model,
+ * just define FAR to be empty.
+ */
+#ifdef SYS16BIT
+# if defined(M_I86SM) || defined(M_I86MM)
+ /* MSC small or medium model */
+# define SMALL_MEDIUM
+# ifdef _MSC_VER
+# define FAR _far
+# else
+# define FAR far
+# endif
+# endif
+# if (defined(__SMALL__) || defined(__MEDIUM__))
+ /* Turbo C small or medium model */
+# define SMALL_MEDIUM
+# ifdef __BORLANDC__
+# define FAR _far
+# else
+# define FAR far
+# endif
+# endif
+#endif
+
+#if defined(WINDOWS) || defined(WIN32)
+ /* If building or using zlib as a DLL, define ZLIB_DLL.
+ * This is not mandatory, but it offers a little performance increase.
+ */
+# ifdef ZLIB_DLL
+# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500))
+# ifdef ZLIB_INTERNAL
+# define ZEXTERN extern __declspec(dllexport)
+# else
+# define ZEXTERN extern __declspec(dllimport)
+# endif
+# endif
+# endif /* ZLIB_DLL */
+ /* If building or using zlib with the WINAPI/WINAPIV calling convention,
+ * define ZLIB_WINAPI.
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+ */
+# ifdef ZLIB_WINAPI
+# ifdef FAR
+# undef FAR
+# endif
+# include <windows.h>
+ /* No need for _export, use ZLIB.DEF instead. */
+ /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+# define ZEXPORT WINAPI
+# ifdef WIN32
+# define ZEXPORTVA WINAPIV
+# else
+# define ZEXPORTVA FAR CDECL
+# endif
+# endif
+#endif
+
+#if defined (__BEOS__)
+# ifdef ZLIB_DLL
+# ifdef ZLIB_INTERNAL
+# define ZEXPORT __declspec(dllexport)
+# define ZEXPORTVA __declspec(dllexport)
+# else
+# define ZEXPORT __declspec(dllimport)
+# define ZEXPORTVA __declspec(dllimport)
+# endif
+# endif
+#endif
+
+#ifndef ZEXTERN
+# define ZEXTERN extern
+#endif
+#ifndef ZEXPORT
+# define ZEXPORT
+#endif
+#ifndef ZEXPORTVA
+# define ZEXPORTVA
+#endif
+
+#ifndef FAR
+# define FAR
+#endif
+
+#if !defined(__MACTYPES__)
+typedef unsigned char Byte; /* 8 bits */
+#endif
+typedef unsigned int uInt; /* 16 bits or more */
+typedef unsigned long uLong; /* 32 bits or more */
+
+#ifdef SMALL_MEDIUM
+ /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */
+# define Bytef Byte FAR
+#else
+ typedef Byte FAR Bytef;
+#endif
+typedef char FAR charf;
+typedef int FAR intf;
+typedef uInt FAR uIntf;
+typedef uLong FAR uLongf;
+
+#ifdef STDC
+ typedef void const *voidpc;
+ typedef void FAR *voidpf;
+ typedef void *voidp;
+#else
+ typedef Byte const *voidpc;
+ typedef Byte FAR *voidpf;
+ typedef Byte *voidp;
+#endif
+
+/* ./configure may #define Z_U4 here */
+
+#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC)
+# include <limits.h>
+# if (UINT_MAX == 0xffffffffUL)
+# define Z_U4 unsigned
+# else
+# if (ULONG_MAX == 0xffffffffUL)
+# define Z_U4 unsigned long
+# else
+# if (USHRT_MAX == 0xffffffffUL)
+# define Z_U4 unsigned short
+# endif
+# endif
+# endif
+#endif
+
+#ifdef Z_U4
+ typedef Z_U4 z_crc_t;
+#else
+ typedef unsigned long z_crc_t;
+#endif
+
+#ifdef HAVE_UNISTD_H /* may be set to #if 1 by ./configure */
+# define Z_HAVE_UNISTD_H
+#endif
+
+#ifdef HAVE_STDARG_H /* may be set to #if 1 by ./configure */
+# define Z_HAVE_STDARG_H
+#endif
+
+#ifdef STDC
+# ifndef Z_SOLO
+# include <sys/types.h> /* for off_t */
+# endif
+#endif
+
+#ifdef _WIN32
+# include <stddef.h> /* for wchar_t */
+#endif
+
+/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and
+ * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even
+ * though the former does not conform to the LFS document), but considering
+ * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as
+ * equivalently requesting no 64-bit operations
+ */
+#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1
+# undef _LARGEFILE64_SOURCE
+#endif
+
+#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H)
+# define Z_HAVE_UNISTD_H
+#endif
+#ifndef Z_SOLO
+# if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE)
+# include <unistd.h> /* for SEEK_*, off_t, and _LFS64_LARGEFILE */
+# ifdef VMS
+# include <unixio.h> /* for off_t */
+# endif
+# ifndef z_off_t
+# define z_off_t off_t
+# endif
+# endif
+#endif
+
+#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0
+# define Z_LFS64
+#endif
+
+#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64)
+# define Z_LARGE64
+#endif
+
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64)
+# define Z_WANT64
+#endif
+
+#if !defined(SEEK_SET) && !defined(Z_SOLO)
+# define SEEK_SET 0 /* Seek from beginning of file. */
+# define SEEK_CUR 1 /* Seek from current position. */
+# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */
+#endif
+
+#ifndef z_off_t
+# define z_off_t long
+#endif
+
+#if !defined(_WIN32) && defined(Z_LARGE64)
+# define z_off64_t off64_t
+#else
+# if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO)
+# define z_off64_t __int64
+# else
+# define z_off64_t z_off_t
+# endif
+#endif
+
+/* MVS linker does not support external names larger than 8 bytes */
+#if defined(__MVS__)
+ #pragma map(deflateInit_,"DEIN")
+ #pragma map(deflateInit2_,"DEIN2")
+ #pragma map(deflateEnd,"DEEND")
+ #pragma map(deflateBound,"DEBND")
+ #pragma map(inflateInit_,"ININ")
+ #pragma map(inflateInit2_,"ININ2")
+ #pragma map(inflateEnd,"INEND")
+ #pragma map(inflateSync,"INSY")
+ #pragma map(inflateSetDictionary,"INSEDI")
+ #pragma map(compressBound,"CMBND")
+ #pragma map(inflate_table,"INTABL")
+ #pragma map(inflate_fast,"INFA")
+ #pragma map(inflate_copyright,"INCOPY")
+#endif
+
+#endif /* ZCONF_H */
diff --git a/kmc_tools/libs/zlib.h b/kmc_tools/libs/zlib.h
new file mode 100644
index 0000000..3edf3ac
--- /dev/null
+++ b/kmc_tools/libs/zlib.h
@@ -0,0 +1,1744 @@
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.7, May 2nd, 2012
+
+ Copyright (C) 1995-2012 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup at gzip.org madler at alumni.caltech.edu
+
+
+ The data format used by the zlib library is described by RFCs (Request for
+ Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950
+ (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format).
+*/
+
+#ifndef ZLIB_H
+#define ZLIB_H
+
+#include "zconf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZLIB_VERSION "1.2.7"
+#define ZLIB_VERNUM 0x1270
+#define ZLIB_VER_MAJOR 1
+#define ZLIB_VER_MINOR 2
+#define ZLIB_VER_REVISION 7
+#define ZLIB_VER_SUBREVISION 0
+
+/*
+ The 'zlib' compression library provides in-memory compression and
+ decompression functions, including integrity checks of the uncompressed data.
+ This version of the library supports only one compression method (deflation)
+ but other algorithms will be added later and will have the same stream
+ interface.
+
+ Compression can be done in a single step if the buffers are large enough,
+ or can be done by repeated calls of the compression function. In the latter
+ case, the application must provide more input and/or consume the output
+ (providing more output space) before each call.
+
+ The compressed data format used by default by the in-memory functions is
+ the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+ around a deflate stream, which is itself documented in RFC 1951.
+
+ The library also supports reading and writing files in gzip (.gz) format
+ with an interface similar to that of stdio using the functions that start
+ with "gz". The gzip format is different from the zlib format. gzip is a
+ gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
+
+ This library can optionally read and write gzip streams in memory as well.
+
+ The zlib format was designed to be compact and fast for use in memory
+ and on communications channels. The gzip format was designed for single-
+ file compression on file systems, has a larger header than zlib to maintain
+ directory information, and uses a different, slower check method than zlib.
+
+ The library does not install any signal handler. The decoder checks
+ the consistency of the compressed data, so the library should never crash
+ even in case of corrupted input.
+*/
+
+typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
+typedef void (*free_func) OF((voidpf opaque, voidpf address));
+
+struct internal_state;
+
+typedef struct z_stream_s {
+ z_const Bytef *next_in; /* next input byte */
+ uInt avail_in; /* number of bytes available at next_in */
+ uLong total_in; /* total number of input bytes read so far */
+
+ Bytef *next_out; /* next output byte should be put there */
+ uInt avail_out; /* remaining free space at next_out */
+ uLong total_out; /* total number of bytes output so far */
+
+ z_const char *msg; /* last error message, NULL if no error */
+ struct internal_state FAR *state; /* not visible by applications */
+
+ alloc_func zalloc; /* used to allocate the internal state */
+ free_func zfree; /* used to free the internal state */
+ voidpf opaque; /* private data object passed to zalloc and zfree */
+
+ int data_type; /* best guess about the data type: binary or text */
+ uLong adler; /* adler32 value of the uncompressed data */
+ uLong reserved; /* reserved for future use */
+} z_stream;
+
+typedef z_stream FAR *z_streamp;
+
+/*
+ gzip header information passed to and from zlib routines. See RFC 1952
+ for more details on the meanings of these fields.
+*/
+typedef struct gz_header_s {
+ int text; /* true if compressed data believed to be text */
+ uLong time; /* modification time */
+ int xflags; /* extra flags (not used when writing a gzip file) */
+ int os; /* operating system */
+ Bytef *extra; /* pointer to extra field or Z_NULL if none */
+ uInt extra_len; /* extra field length (valid if extra != Z_NULL) */
+ uInt extra_max; /* space at extra (only when reading header) */
+ Bytef *name; /* pointer to zero-terminated file name or Z_NULL */
+ uInt name_max; /* space at name (only when reading header) */
+ Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */
+ uInt comm_max; /* space at comment (only when reading header) */
+ int hcrc; /* true if there was or will be a header crc */
+ int done; /* true when done reading gzip header (not used
+ when writing a gzip file) */
+} gz_header;
+
+typedef gz_header FAR *gz_headerp;
+
+/*
+ The application must update next_in and avail_in when avail_in has dropped
+ to zero. It must update next_out and avail_out when avail_out has dropped
+ to zero. The application must initialize zalloc, zfree and opaque before
+ calling the init function. All other fields are set by the compression
+ library and must not be updated by the application.
+
+ The opaque value provided by the application will be passed as the first
+ parameter for calls of zalloc and zfree. This can be useful for custom
+ memory management. The compression library attaches no meaning to the
+ opaque value.
+
+ zalloc must return Z_NULL if there is not enough memory for the object.
+ If zlib is used in a multi-threaded application, zalloc and zfree must be
+ thread safe.
+
+ On 16-bit systems, the functions zalloc and zfree must be able to allocate
+ exactly 65536 bytes, but will not be required to allocate more than this if
+ the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, pointers
+ returned by zalloc for objects of exactly 65536 bytes *must* have their
+ offset normalized to zero. The default allocation function provided by this
+ library ensures this (see zutil.c). To reduce memory requirements and avoid
+ any allocation of 64K objects, at the expense of compression ratio, compile
+ the library with -DMAX_WBITS=14 (see zconf.h).
+
+ The fields total_in and total_out can be used for statistics or progress
+ reports. After compression, total_in holds the total size of the
+ uncompressed data and may be saved for use in the decompressor (particularly
+ if the decompressor wants to decompress everything in a single step).
+*/
+
+ /* constants */
+
+#define Z_NO_FLUSH 0
+#define Z_PARTIAL_FLUSH 1
+#define Z_SYNC_FLUSH 2
+#define Z_FULL_FLUSH 3
+#define Z_FINISH 4
+#define Z_BLOCK 5
+#define Z_TREES 6
+/* Allowed flush values; see deflate() and inflate() below for details */
+
+#define Z_OK 0
+#define Z_STREAM_END 1
+#define Z_NEED_DICT 2
+#define Z_ERRNO (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR (-3)
+#define Z_MEM_ERROR (-4)
+#define Z_BUF_ERROR (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative values
+ * are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION 0
+#define Z_BEST_SPEED 1
+#define Z_BEST_COMPRESSION 9
+#define Z_DEFAULT_COMPRESSION (-1)
+/* compression levels */
+
+#define Z_FILTERED 1
+#define Z_HUFFMAN_ONLY 2
+#define Z_RLE 3
+#define Z_FIXED 4
+#define Z_DEFAULT_STRATEGY 0
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY 0
+#define Z_TEXT 1
+#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */
+#define Z_UNKNOWN 2
+/* Possible values of the data_type field (though see inflate()) */
+
+#define Z_DEFLATED 8
+/* The deflate compression method (the only one supported in this version) */
+
+#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */
+
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2 */
+
+
+ /* basic functions */
+
+ZEXTERN const char * ZEXPORT zlibVersion OF((void));
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+ If the first character differs, the library code actually used is not
+ compatible with the zlib.h header file used by the application. This check
+ is automatically made by deflateInit and inflateInit.
+ */
+
+/*
+ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
+
+ Initializes the internal stream state for compression. The fields
+ zalloc, zfree and opaque must be initialized before by the caller. If
+ zalloc and zfree are set to Z_NULL, deflateInit updates them to use default
+ allocation functions.
+
+ The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+ 1 gives best speed, 9 gives best compression, 0 gives no compression at all
+ (the input data is simply copied a block at a time). Z_DEFAULT_COMPRESSION
+ requests a default compromise between speed and compression (currently
+ equivalent to level 6).
+
+ deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if level is not a valid compression level, or
+ Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+ with the version assumed by the caller (ZLIB_VERSION). msg is set to null
+ if there is no error message. deflateInit does not perform any compression:
+ this will be done by deflate().
+*/
+
+
+ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
+/*
+ deflate compresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce
+ some output latency (reading input without producing any output) except when
+ forced to flush.
+
+ The detailed semantics are as follows. deflate performs one or both of the
+ following actions:
+
+ - Compress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in and avail_in are updated and
+ processing will resume at this point for the next call of deflate().
+
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. This action is forced if the parameter flush is non zero.
+ Forcing flush frequently degrades the compression ratio, so this parameter
+ should be set only when necessary (in interactive applications). Some
+ output may be provided even if flush is not set.
+
+ Before the call of deflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming more
+ output, and updating avail_in or avail_out accordingly; avail_out should
+ never be zero before the call. The application can consume the compressed
+ output when it wants, for example when the output buffer is full (avail_out
+ == 0), or after each call of deflate(). If deflate returns Z_OK and with
+ zero avail_out, it must be called again after making room in the output
+ buffer because there might be more output pending.
+
+ Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+ decide how much data to accumulate before producing output, in order to
+ maximize compression.
+
+ If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+ flushed to the output buffer and the output is aligned on a byte boundary, so
+ that the decompressor can get all input data available so far. (In
+ particular avail_in is zero after the call if enough output space has been
+ provided before the call.) Flushing may degrade compression for some
+ compression algorithms and so it should be used only when necessary. This
+ completes the current deflate block and follows it with an empty stored block
+ that is three bits plus filler bits to the next byte, followed by four bytes
+ (00 00 ff ff).
+
+ If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the
+ output buffer, but the output is not aligned to a byte boundary. All of the
+ input data so far will be available to the decompressor, as for Z_SYNC_FLUSH.
+ This completes the current deflate block and follows it with an empty fixed
+ codes block that is 10 bits long. This assures that enough bytes are output
+ in order for the decompressor to finish the block before the empty fixed code
+ block.
+
+ If flush is set to Z_BLOCK, a deflate block is completed and emitted, as
+ for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to
+ seven bits of the current block are held to be written as the next byte after
+ the next deflate block is completed. In this case, the decompressor may not
+ be provided enough bits at this point in order to complete decompression of
+ the data provided so far to the compressor. It may need to wait for the next
+ block to be emitted. This is for advanced applications that need to control
+ the emission of deflate blocks.
+
+ If flush is set to Z_FULL_FLUSH, all output is flushed as with
+ Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+ restart from this point if previous compressed data has been damaged or if
+ random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
+ compression.
+
+ If deflate returns with avail_out == 0, this function must be called again
+ with the same value of the flush parameter and more output space (updated
+ avail_out), until the flush is complete (deflate returns with non-zero
+ avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+ avail_out is greater than six to avoid repeated flush markers due to
+ avail_out == 0 on return.
+
+ If the parameter flush is set to Z_FINISH, pending input is processed,
+ pending output is flushed and deflate returns with Z_STREAM_END if there was
+ enough output space; if deflate returns with Z_OK, this function must be
+ called again with Z_FINISH and more output space (updated avail_out) but no
+ more input data, until it returns with Z_STREAM_END or an error. After
+ deflate has returned Z_STREAM_END, the only possible operations on the stream
+ are deflateReset or deflateEnd.
+
+ Z_FINISH can be used immediately after deflateInit if all the compression
+ is to be done in a single step. In this case, avail_out must be at least the
+ value returned by deflateBound (see below). Then deflate is guaranteed to
+ return Z_STREAM_END. If not enough output space is provided, deflate will
+ not return Z_STREAM_END, and it must be called again as described above.
+
+ deflate() sets strm->adler to the adler32 checksum of all input read
+ so far (that is, total_in bytes).
+
+ deflate() may update strm->data_type if it can make a good guess about
+ the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
+ binary. This field is only for information purposes and does not affect the
+ compression algorithm in any manner.
+
+ deflate() returns Z_OK if some progress has been made (more input
+ processed or more output produced), Z_STREAM_END if all input has been
+ consumed and all output has been produced (only when flush is set to
+ Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+ if next_in or next_out was Z_NULL), Z_BUF_ERROR if no progress is possible
+ (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
+ fatal, and deflate() can be called again with more input and more output
+ space to continue compressing.
+*/
+
+
+ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any pending
+ output.
+
+ deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+ stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+ prematurely (some input or output was discarded). In the error case, msg
+ may be set but then points to a static string (which must not be
+ deallocated).
+*/
+
+
+/*
+ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
+
+ Initializes the internal stream state for decompression. The fields
+ next_in, avail_in, zalloc, zfree and opaque must be initialized before by
+ the caller. If next_in is not Z_NULL and avail_in is large enough (the
+ exact value depends on the compression method), inflateInit determines the
+ compression method from the zlib header and allocates all data structures
+ accordingly; otherwise the allocation will be deferred to the first call of
+ inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to
+ use default allocation functions.
+
+ inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+ version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+ invalid, such as a null pointer to the structure. msg is set to null if
+ there is no error message. inflateInit does not perform any decompression
+ apart from possibly reading the zlib header if present: actual decompression
+ will be done by inflate(). (So next_in and avail_in may be modified, but
+ next_out and avail_out are unused and unchanged.) The current implementation
+ of inflateInit() does not process any header information -- that is deferred
+ until inflate() is called.
+*/
+
+
+ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
+/*
+ inflate decompresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce
+ some output latency (reading input without producing any output) except when
+ forced to flush.
+
+ The detailed semantics are as follows. inflate performs one or both of the
+ following actions:
+
+ - Decompress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in and avail_in are updated and
+ processing will resume at this point for the next call of inflate().
+
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. inflate() provides as much output as possible, until there is
+ no more input data or no more space in the output buffer (see below about
+ the flush parameter).
+
+ Before the call of inflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming more
+ output, and updating the next_* and avail_* values accordingly. The
+ application can consume the uncompressed output when it wants, for example
+ when the output buffer is full (avail_out == 0), or after each call of
+ inflate(). If inflate returns Z_OK and with zero avail_out, it must be
+ called again after making room in the output buffer because there might be
+ more output pending.
+
+ The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH,
+ Z_BLOCK, or Z_TREES. Z_SYNC_FLUSH requests that inflate() flush as much
+ output as possible to the output buffer. Z_BLOCK requests that inflate()
+ stop if and when it gets to the next deflate block boundary. When decoding
+ the zlib or gzip format, this will cause inflate() to return immediately
+ after the header and before the first block. When doing a raw inflate,
+ inflate() will go ahead and process the first block, and will return when it
+ gets to the end of that block, or when it runs out of data.
+
+ The Z_BLOCK option assists in appending to or combining deflate streams.
+ Also to assist in this, on return inflate() will set strm->data_type to the
+ number of unused bits in the last byte taken from strm->next_in, plus 64 if
+ inflate() is currently decoding the last block in the deflate stream, plus
+ 128 if inflate() returned immediately after decoding an end-of-block code or
+ decoding the complete header up to just before the first byte of the deflate
+ stream. The end-of-block will not be indicated until all of the uncompressed
+ data from that block has been written to strm->next_out. The number of
+ unused bits may in general be greater than seven, except when bit 7 of
+ data_type is set, in which case the number of unused bits will be less than
+ eight. data_type is set as noted here every time inflate() returns for all
+ flush options, and so can be used to determine the amount of currently
+ consumed input in bits.
+
+ The Z_TREES option behaves as Z_BLOCK does, but it also returns when the
+ end of each deflate block header is reached, before any actual data in that
+ block is decoded. This allows the caller to determine the length of the
+ deflate block header for later use in random access within a deflate block.
+ 256 is added to the value of strm->data_type when inflate() returns
+ immediately after reaching the end of the deflate block header.
+
+ inflate() should normally be called until it returns Z_STREAM_END or an
+ error. However if all decompression is to be performed in a single step (a
+ single call of inflate), the parameter flush should be set to Z_FINISH. In
+ this case all pending input is processed and all pending output is flushed;
+ avail_out must be large enough to hold all of the uncompressed data for the
+ operation to complete. (The size of the uncompressed data may have been
+ saved by the compressor for this purpose.) The use of Z_FINISH is not
+ required to perform an inflation in one step. However it may be used to
+ inform inflate that a faster approach can be used for the single inflate()
+ call. Z_FINISH also informs inflate to not maintain a sliding window if the
+ stream completes, which reduces inflate's memory footprint. If the stream
+ does not complete, either because not all of the stream is provided or not
+ enough output space is provided, then a sliding window will be allocated and
+ inflate() can be called again to continue the operation as if Z_NO_FLUSH had
+ been used.
+
+ In this implementation, inflate() always flushes as much output as
+ possible to the output buffer, and always uses the faster approach on the
+ first call. So the effects of the flush parameter in this implementation are
+ on the return value of inflate() as noted below, when inflate() returns early
+ when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of
+ memory for a sliding window when Z_FINISH is used.
+
+ If a preset dictionary is needed after this call (see inflateSetDictionary
+ below), inflate sets strm->adler to the Adler-32 checksum of the dictionary
+ chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+ strm->adler to the Adler-32 checksum of all output produced so far (that is,
+ total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+ below. At the end of the stream, inflate() checks that its computed adler32
+ checksum is equal to that saved by the compressor and returns Z_STREAM_END
+ only if the checksum is correct.
+
+ inflate() can decompress and check either zlib-wrapped or gzip-wrapped
+ deflate data. The header type is detected automatically, if requested when
+ initializing with inflateInit2(). Any information contained in the gzip
+ header is not retained, so applications that need that information should
+ instead use raw inflate, see inflateInit2() below, or inflateBack() and
+ perform their own processing of the gzip header and trailer. When processing
+ gzip-wrapped deflate data, strm->adler is set to the CRC-32 of the output
+ produced so far. The CRC-32 is checked against the gzip trailer.
+
+ inflate() returns Z_OK if some progress has been made (more input processed
+ or more output produced), Z_STREAM_END if the end of the compressed data has
+ been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+ preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+ corrupted (input stream not conforming to the zlib format or incorrect check
+ value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+ next_in or next_out was Z_NULL), Z_MEM_ERROR if there was not enough memory,
+ Z_BUF_ERROR if no progress is possible or if there was not enough room in the
+ output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
+ inflate() can be called again with more input and more output space to
+ continue decompressing. If Z_DATA_ERROR is returned, the application may
+ then call inflateSync() to look for a good compression block if a partial
+ recovery of the data is desired.
+*/
+
+
+ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any pending
+ output.
+
+ inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
+ was inconsistent. In the error case, msg may be set but then points to a
+ static string (which must not be deallocated).
+*/
+
+
+ /* Advanced functions */
+
+/*
+ The following functions are needed only in some special applications.
+*/
+
+/*
+ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
+ int level,
+ int method,
+ int windowBits,
+ int memLevel,
+ int strategy));
+
+ This is another version of deflateInit with more compression options. The
+ fields zalloc, zfree and opaque must be initialized before by the
+ caller.
+
+ The method parameter is the compression method. It must be Z_DEFLATED in
+ this version of the library.
+
+ The windowBits parameter is the base two logarithm of the window size
+ (the size of the history buffer). It should be in the range 8..15 for this
+ version of the library. Larger values of this parameter result in better
+ compression at the expense of memory usage. The default value is 15 if
+ deflateInit is used instead.
+
+ windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
+ determines the window size. deflate() will then generate raw deflate data
+ with no zlib header or trailer, and will not compute an adler32 check value.
+
+ windowBits can also be greater than 15 for optional gzip encoding. Add
+ 16 to windowBits to write a simple gzip header and trailer around the
+ compressed data instead of a zlib wrapper. The gzip header will have no
+ file name, no extra data, no comment, no modification time (set to zero), no
+ header crc, and the operating system will be set to 255 (unknown). If a
+ gzip stream is being written, strm->adler is a crc32 instead of an adler32.
+
+ The memLevel parameter specifies how much memory should be allocated
+ for the internal compression state. memLevel=1 uses minimum memory but is
+ slow and reduces compression ratio; memLevel=9 uses maximum memory for
+ optimal speed. The default value is 8. See zconf.h for total memory usage
+ as a function of windowBits and memLevel.
+
+ The strategy parameter is used to tune the compression algorithm. Use the
+ value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+ filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+ string match), or Z_RLE to limit match distances to one (run-length
+ encoding). Filtered data consists mostly of small values with a somewhat
+ random distribution. In this case, the compression algorithm is tuned to
+ compress them better. The effect of Z_FILTERED is to force more Huffman
+ coding and less string matching; it is somewhat intermediate between
+ Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as
+ fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data. The
+ strategy parameter only affects the compression ratio but not the
+ correctness of the compressed output even if it is not set appropriately.
+ Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler
+ decoder for special applications.
+
+ deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid
+ method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is
+ incompatible with the version assumed by the caller (ZLIB_VERSION). msg is
+ set to null if there is no error message. deflateInit2 does not perform any
+ compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+/*
+ Initializes the compression dictionary from the given byte sequence
+ without producing any compressed output. When using the zlib format, this
+ function must be called immediately after deflateInit, deflateInit2 or
+ deflateReset, and before any call of deflate. When doing raw deflate, this
+ function must be called either before any call of deflate, or immediately
+ after the completion of a deflate block, i.e. after all input has been
+ consumed and all output has been delivered when using any of the flush
+ options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH. The
+ compressor and decompressor must use exactly the same dictionary (see
+ inflateSetDictionary).
+
+ The dictionary should consist of strings (byte sequences) that are likely
+ to be encountered later in the data to be compressed, with the most commonly
+ used strings preferably put towards the end of the dictionary. Using a
+ dictionary is most useful when the data to be compressed is short and can be
+ predicted with good accuracy; the data can then be compressed better than
+ with the default empty dictionary.
+
+ Depending on the size of the compression data structures selected by
+ deflateInit or deflateInit2, a part of the dictionary may in effect be
+ discarded, for example if the dictionary is larger than the window size
+ provided in deflateInit or deflateInit2. Thus the strings most likely to be
+ useful should be put at the end of the dictionary, not at the front. In
+ addition, the current implementation of deflate will use at most the window
+ size minus 262 bytes of the provided dictionary.
+
+ Upon return of this function, strm->adler is set to the adler32 value
+ of the dictionary; the decompressor may later use this value to determine
+ which dictionary has been used by the compressor. (The adler32 value
+ applies to the whole dictionary even if only a subset of the dictionary is
+ actually used by the compressor.) If a raw deflate was requested, then the
+ adler32 value is not computed and strm->adler is not set.
+
+ deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+ parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is
+ inconsistent (for example if deflate has already been called for this stream
+ or if not at a block boundary for raw deflate). deflateSetDictionary does
+ not perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
+ z_streamp source));
+/*
+ Sets the destination stream as a complete copy of the source stream.
+
+ This function can be useful when several compression strategies will be
+ tried, for example when there are several ways of pre-processing the input
+ data with a filter. The streams that will be discarded should then be freed
+ by calling deflateEnd. Note that deflateCopy duplicates the internal
+ compression state which can be quite large, so this strategy is slow and can
+ consume lots of memory.
+
+ deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being Z_NULL). msg is left unchanged in both source and
+ destination.
+*/
+
+ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
+/*
+ This function is equivalent to deflateEnd followed by deflateInit,
+ but does not free and reallocate all the internal compression state. The
+ stream will keep the same compression level and any other attributes that
+ may have been set by deflateInit2.
+
+ deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL).
+*/
+
+ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
+ int level,
+ int strategy));
+/*
+ Dynamically update the compression level and compression strategy. The
+ interpretation of level and strategy is as in deflateInit2. This can be
+ used to switch between compression and straight copy of the input data, or
+ to switch to a different kind of input data requiring a different strategy.
+ If the compression level is changed, the input available so far is
+ compressed with the old level (and may be flushed); the new level will take
+ effect only at the next call of deflate().
+
+ Before the call of deflateParams, the stream state must be set as for
+ a call of deflate(), since the currently available input may have to be
+ compressed and flushed. In particular, strm->avail_out must be non-zero.
+
+ deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
+ stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR if
+ strm->avail_out was zero.
+*/
+
+ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
+ int good_length,
+ int max_lazy,
+ int nice_length,
+ int max_chain));
+/*
+ Fine tune deflate's internal compression parameters. This should only be
+ used by someone who understands the algorithm used by zlib's deflate for
+ searching for the best matching string, and even then only by the most
+ fanatic optimizer trying to squeeze out the last compressed bit for their
+ specific input data. Read the deflate.c source code for the meaning of the
+ max_lazy, good_length, nice_length, and max_chain parameters.
+
+ deflateTune() can be called after deflateInit() or deflateInit2(), and
+ returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
+ */
+
+ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
+ uLong sourceLen));
+/*
+ deflateBound() returns an upper bound on the compressed size after
+ deflation of sourceLen bytes. It must be called after deflateInit() or
+ deflateInit2(), and after deflateSetHeader(), if used. This would be used
+ to allocate an output buffer for deflation in a single pass, and so would be
+ called before deflate(). If that first deflate() call is provided the
+ sourceLen input bytes, an output buffer allocated to the size returned by
+ deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed
+ to return Z_STREAM_END. Note that it is possible for the compressed size to
+ be larger than the value returned by deflateBound() if flush options other
+ than Z_FINISH or Z_NO_FLUSH are used.
+*/
+
+ZEXTERN int ZEXPORT deflatePending OF((z_streamp strm,
+ unsigned *pending,
+ int *bits));
+/*
+ deflatePending() returns the number of bytes and bits of output that have
+ been generated, but not yet provided in the available output. The bytes not
+ provided would be due to the available output space having been consumed.
+ The number of bits of output not provided are between 0 and 7, where they
+ await more bits to join them in order to fill out a full byte. If pending
+ or bits are Z_NULL, then those values are not set.
+
+ deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+ */
+
+ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
+ int bits,
+ int value));
+/*
+ deflatePrime() inserts bits in the deflate output stream. The intent
+ is that this function is used to start off the deflate output with the bits
+ leftover from a previous deflate stream when appending to it. As such, this
+ function can only be used for raw deflate, and must be used before the first
+ deflate() call after a deflateInit2() or deflateReset(). bits must be less
+ than or equal to 16, and that many of the least significant bits of value
+ will be inserted in the output.
+
+ deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough
+ room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the
+ source stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
+ gz_headerp head));
+/*
+ deflateSetHeader() provides gzip header information for when a gzip
+ stream is requested by deflateInit2(). deflateSetHeader() may be called
+ after deflateInit2() or deflateReset() and before the first call of
+ deflate(). The text, time, os, extra field, name, and comment information
+ in the provided gz_header structure are written to the gzip header (xflag is
+ ignored -- the extra flags are set according to the compression level). The
+ caller must ensure that, if not Z_NULL, name and comment are terminated with
+ a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
+ available there. If hcrc is true, a gzip header crc is included. Note that
+ the current versions of the command-line version of gzip (up through version
+ 1.3.x) do not support header crc's, and will report that it is a "multi-part
+ gzip file" and give up.
+
+ If deflateSetHeader is not used, the default gzip header has text false,
+ the time set to zero, and os set to 255, with no extra, name, or comment
+ fields. The gzip header is returned to the default state by deflateReset().
+
+ deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
+ int windowBits));
+
+ This is another version of inflateInit with an extra parameter. The
+ fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+ before by the caller.
+
+ The windowBits parameter is the base two logarithm of the maximum window
+ size (the size of the history buffer). It should be in the range 8..15 for
+ this version of the library. The default value is 15 if inflateInit is used
+ instead. windowBits must be greater than or equal to the windowBits value
+ provided to deflateInit2() while compressing, or it must be equal to 15 if
+ deflateInit2() was not used. If a compressed stream with a larger window
+ size is given as input, inflate() will return with the error code
+ Z_DATA_ERROR instead of trying to allocate a larger window.
+
+ windowBits can also be zero to request that inflate use the window size in
+ the zlib header of the compressed stream.
+
+ windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+ determines the window size. inflate() will then process raw deflate data,
+ not looking for a zlib or gzip header, not generating a check value, and not
+ looking for any check values for comparison at the end of the stream. This
+ is for use with other formats that use the deflate compressed data format
+ such as zip. Those formats provide their own check values. If a custom
+ format is developed using the raw deflate format for compressed data, it is
+ recommended that a check value such as an adler32 or a crc32 be applied to
+ the uncompressed data as is done in the zlib, gzip, and zip formats. For
+ most applications, the zlib format should be used as is. Note that comments
+ above on the use in deflateInit2() apply to the magnitude of windowBits.
+
+ windowBits can also be greater than 15 for optional gzip decoding. Add
+ 32 to windowBits to enable zlib and gzip decoding with automatic header
+ detection, or add 16 to decode only the gzip format (the zlib format will
+ return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is a
+ crc32 instead of an adler32.
+
+ inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+ version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+ invalid, such as a null pointer to the structure. msg is set to null if
+ there is no error message. inflateInit2 does not perform any decompression
+ apart from possibly reading the zlib header if present: actual decompression
+ will be done by inflate(). (So next_in and avail_in may be modified, but
+ next_out and avail_out are unused and unchanged.) The current implementation
+ of inflateInit2() does not process any header information -- that is
+ deferred until inflate() is called.
+*/
+
+ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+/*
+ Initializes the decompression dictionary from the given uncompressed byte
+ sequence. This function must be called immediately after a call of inflate,
+ if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
+ can be determined from the adler32 value returned by that call of inflate.
+ The compressor and decompressor must use exactly the same dictionary (see
+ deflateSetDictionary). For raw inflate, this function can be called at any
+ time to set the dictionary. If the provided dictionary is smaller than the
+ window and there is already data in the window, then the provided dictionary
+ will amend what's there. The application must ensure that the dictionary
+ that was used for compression is provided.
+
+ inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+ parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is
+ inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+ expected one (incorrect adler32 value). inflateSetDictionary does not
+ perform any decompression: this will be done by subsequent calls of
+ inflate().
+*/
+
+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+/*
+ Skips invalid compressed data until a possible full flush point (see above
+ for the description of deflate with Z_FULL_FLUSH) can be found, or until all
+ available input is skipped. No output is provided.
+
+ inflateSync searches for a 00 00 FF FF pattern in the compressed data.
+ All full flush points have this pattern, but not all occurrences of this
+ pattern are full flush points.
+
+ inflateSync returns Z_OK if a possible full flush point has been found,
+ Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point
+ has been found, or Z_STREAM_ERROR if the stream structure was inconsistent.
+ In the success case, the application may save the current value of
+ total_in which indicates where valid compressed data was found. In the
+ error case, the application may repeatedly call inflateSync, providing more
+ input each time, until success or end of the input data.
+*/
+
+ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
+ z_streamp source));
+/*
+ Sets the destination stream as a complete copy of the source stream.
+
+ This function can be useful when randomly accessing a large stream. The
+ first pass through the stream can periodically record the inflate state,
+ allowing restarting inflate at those points when randomly accessing the
+ stream.
+
+ inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being Z_NULL). msg is left unchanged in both source and
+ destination.
+*/
+
+ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
+/*
+ This function is equivalent to inflateEnd followed by inflateInit,
+ but does not free and reallocate all the internal decompression state. The
+ stream will keep attributes that may have been set by inflateInit2.
+
+ inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL).
+*/
+
+ZEXTERN int ZEXPORT inflateReset2 OF((z_streamp strm,
+ int windowBits));
+/*
+ This function is the same as inflateReset, but it also permits changing
+ the wrap and window size requests. The windowBits parameter is interpreted
+ the same as it is for inflateInit2.
+
+ inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL), or if
+ the windowBits parameter is invalid.
+*/
+
+ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
+ int bits,
+ int value));
+/*
+ This function inserts bits in the inflate input stream. The intent is
+ that this function is used to start inflating at a bit position in the
+ middle of a byte. The provided bits will be used before any bytes are used
+ from next_in. This function should only be used with raw inflate, and
+ should be used before the first inflate() call after inflateInit2() or
+ inflateReset(). bits must be less than or equal to 16, and that many of the
+ least significant bits of value will be inserted in the input.
+
+ If bits is negative, then the input stream bit buffer is emptied. Then
+ inflatePrime() can be called again to put bits in the buffer. This is used
+ to clear out bits leftover after feeding inflate a block description prior
+ to feeding inflate codes.
+
+ inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+ZEXTERN long ZEXPORT inflateMark OF((z_streamp strm));
+/*
+ This function returns two values, one in the lower 16 bits of the return
+ value, and the other in the remaining upper bits, obtained by shifting the
+ return value down 16 bits. If the upper value is -1 and the lower value is
+ zero, then inflate() is currently decoding information outside of a block.
+ If the upper value is -1 and the lower value is non-zero, then inflate is in
+ the middle of a stored block, with the lower value equaling the number of
+ bytes from the input remaining to copy. If the upper value is not -1, then
+ it is the number of bits back from the current bit position in the input of
+ the code (literal or length/distance pair) currently being processed. In
+ that case the lower value is the number of bytes already emitted for that
+ code.
+
+ A code is being processed if inflate is waiting for more input to complete
+ decoding of the code, or if it has completed decoding but is waiting for
+ more output space to write the literal or match data.
+
+ inflateMark() is used to mark locations in the input data for random
+ access, which may be at bit positions, and to note those cases where the
+ output of a code may span boundaries of random access blocks. The current
+ location in the input stream can be determined from avail_in and data_type
+ as noted in the description for the Z_BLOCK flush parameter for inflate.
+
+ inflateMark returns the value noted above, or -65536 (-1 << 16) if the
+ provided source stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
+ gz_headerp head));
+/*
+ inflateGetHeader() requests that gzip header information be stored in the
+ provided gz_header structure. inflateGetHeader() may be called after
+ inflateInit2() or inflateReset(), and before the first call of inflate().
+ As inflate() processes the gzip stream, head->done is zero until the header
+ is completed, at which time head->done is set to one. If a zlib stream is
+ being decoded, then head->done is set to -1 to indicate that there will be
+ no gzip header information forthcoming. Note that Z_BLOCK or Z_TREES can be
+ used to force inflate() to return immediately after header processing is
+ complete and before any actual data is decompressed.
+
+ The text, time, xflags, and os fields are filled in with the gzip header
+ contents. hcrc is set to true if there is a header CRC. (The header CRC
+ was valid if done is set to one.) If extra is not Z_NULL, then extra_max
+ contains the maximum number of bytes to write to extra. Once done is true,
+ extra_len contains the actual extra field length, and extra contains the
+ extra field, or that field truncated if extra_max is less than extra_len.
+ If name is not Z_NULL, then up to name_max characters are written there,
+ terminated with a zero unless the length is greater than name_max. If
+ comment is not Z_NULL, then up to comm_max characters are written there,
+ terminated with a zero unless the length is greater than comm_max. When any
+ of extra, name, or comment are not Z_NULL and the respective field is not
+ present in the header, then that field is set to Z_NULL to signal its
+ absence. This allows the use of deflateSetHeader() with the returned
+ structure to duplicate the header. However if those fields are set to
+ allocated memory, then the application will need to save those pointers
+ elsewhere so that they can be eventually freed.
+
+ If inflateGetHeader is not used, then the header information is simply
+ discarded. The header is always checked for validity, including the header
+ CRC if present. inflateReset() will reset the process to discard the header
+ information. The application would need to call inflateGetHeader() again to
+ retrieve the header from the next gzip stream.
+
+ inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window));
+
+ Initialize the internal stream state for decompression using inflateBack()
+ calls. The fields zalloc, zfree and opaque in strm must be initialized
+ before the call. If zalloc and zfree are Z_NULL, then the default library-
+ derived memory allocation routines are used. windowBits is the base two
+ logarithm of the window size, in the range 8..15. window is a caller
+ supplied buffer of that size. Except for special applications where it is
+ assured that deflate was used with small window sizes, windowBits must be 15
+ and a 32K byte window must be supplied to be able to decompress general
+ deflate streams.
+
+ See inflateBack() for the usage of these routines.
+
+ inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+ the parameters are invalid, Z_MEM_ERROR if the internal state could not be
+ allocated, or Z_VERSION_ERROR if the version of the library does not match
+ the version of the header file.
+*/
+
+typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *));
+typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
+
+ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
+ in_func in, void FAR *in_desc,
+ out_func out, void FAR *out_desc));
+/*
+ inflateBack() does a raw inflate with a single call using a call-back
+ interface for input and output. This is more efficient than inflate() for
+ file i/o applications in that it avoids copying between the output and the
+ sliding window by simply making the window itself the output buffer. This
+ function trusts the application to not change the output buffer passed by
+ the output function, at least until inflateBack() returns.
+
+ inflateBackInit() must be called first to allocate the internal state
+ and to initialize the state with the user-provided window buffer.
+ inflateBack() may then be used multiple times to inflate a complete, raw
+ deflate stream with each call. inflateBackEnd() is then called to free the
+ allocated state.
+
+ A raw deflate stream is one with no zlib or gzip header or trailer.
+ This routine would normally be used in a utility that reads zip or gzip
+ files and writes out uncompressed files. The utility would decode the
+ header and process the trailer on its own, hence this routine expects only
+ the raw deflate stream to decompress. This is different from the normal
+ behavior of inflate(), which expects either a zlib or gzip header and
+ trailer around the deflate stream.
+
+ inflateBack() uses two subroutines supplied by the caller that are then
+ called by inflateBack() for input and output. inflateBack() calls those
+ routines until it reads a complete deflate stream and writes out all of the
+ uncompressed data, or until it encounters an error. The function's
+ parameters and return types are defined above in the in_func and out_func
+ typedefs. inflateBack() will call in(in_desc, &buf) which should return the
+ number of bytes of provided input, and a pointer to that input in buf. If
+ there is no input available, in() must return zero--buf is ignored in that
+ case--and inflateBack() will return a buffer error. inflateBack() will call
+ out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out()
+ should return zero on success, or non-zero on failure. If out() returns
+ non-zero, inflateBack() will return with an error. Neither in() nor out()
+ are permitted to change the contents of the window provided to
+ inflateBackInit(), which is also the buffer that out() uses to write from.
+ The length written by out() will be at most the window size. Any non-zero
+ amount of input may be provided by in().
+
+ For convenience, inflateBack() can be provided input on the first call by
+ setting strm->next_in and strm->avail_in. If that input is exhausted, then
+ in() will be called. Therefore strm->next_in must be initialized before
+ calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called
+ immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in
+ must also be initialized, and then if strm->avail_in is not zero, input will
+ initially be taken from strm->next_in[0 .. strm->avail_in - 1].
+
+ The in_desc and out_desc parameters of inflateBack() are passed as the
+ first parameter of in() and out() respectively when they are called. These
+ descriptors can be optionally used to pass any information that the caller-
+ supplied in() and out() functions need to do their job.
+
+ On return, inflateBack() will set strm->next_in and strm->avail_in to
+ pass back any unused input that was provided by the last in() call. The
+ return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+ if in() or out() returned an error, Z_DATA_ERROR if there was a format error
+ in the deflate stream (in which case strm->msg is set to indicate the nature
+ of the error), or Z_STREAM_ERROR if the stream was not properly initialized.
+ In the case of Z_BUF_ERROR, an input or output error can be distinguished
+ using strm->next_in which will be Z_NULL only if in() returned an error. If
+ strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to out() returning
+ non-zero. (in() will always be called before out(), so strm->next_in is
+ assured to be defined if out() returns non-zero.) Note that inflateBack()
+ cannot return Z_OK.
+*/
+
+ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
+/*
+ All memory allocated by inflateBackInit() is freed.
+
+ inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+ state was inconsistent.
+*/
+
+ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
+/* Return flags indicating compile-time options.
+
+ Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+ 1.0: size of uInt
+ 3.2: size of uLong
+ 5.4: size of voidpf (pointer)
+ 7.6: size of z_off_t
+
+ Compiler, assembler, and debug options:
+ 8: DEBUG
+ 9: ASMV or ASMINF -- use ASM code
+ 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+ 11: 0 (reserved)
+
+ One-time table building (smaller code, but not thread-safe if true):
+ 12: BUILDFIXED -- build static block decoding tables when needed
+ 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+ 14,15: 0 (reserved)
+
+ Library content (indicates missing functionality):
+ 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+ deflate code when not needed)
+ 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+ and decode gzip streams (to avoid linking crc code)
+ 18-19: 0 (reserved)
+
+ Operation variations (changes in library functionality):
+ 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+ 21: FASTEST -- deflate algorithm with only one, lowest compression level
+ 22,23: 0 (reserved)
+
+ The sprintf variant used by gzprintf (zero is best):
+ 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+ 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+ 26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+
+ Remainder:
+ 27-31: 0 (reserved)
+ */
+
+#ifndef Z_SOLO
+
+ /* utility functions */
+
+/*
+ The following utility functions are implemented on top of the basic
+ stream-oriented functions. To simplify the interface, some default options
+ are assumed (compression level and memory usage, standard memory allocation
+ functions). The source code of these utility functions can be modified if
+ you need special options.
+*/
+
+ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+/*
+ Compresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total size
+ of the destination buffer, which must be at least the value returned by
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+
+ compress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer.
+*/
+
+ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen,
+ int level));
+/*
+ Compresses the source buffer into the destination buffer. The level
+ parameter has the same meaning as in deflateInit. sourceLen is the byte
+ length of the source buffer. Upon entry, destLen is the total size of the
+ destination buffer, which must be at least the value returned by
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+
+ compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ Z_STREAM_ERROR if the level parameter is invalid.
+*/
+
+ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
+/*
+ compressBound() returns an upper bound on the compressed size after
+ compress() or compress2() on sourceLen bytes. It would be used before a
+ compress() or compress2() call to allocate the destination buffer.
+*/
+
+ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+/*
+ Decompresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total size
+ of the destination buffer, which must be large enough to hold the entire
+ uncompressed data. (The size of the uncompressed data must have been saved
+ previously by the compressor and transmitted to the decompressor by some
+ mechanism outside the scope of this compression library.) Upon exit, destLen
+ is the actual size of the uncompressed buffer.
+
+ uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. In
+ the case where there is not enough room, uncompress() will fill the output
+ buffer with the uncompressed data up to that point.
+*/
+
+ /* gzip file access functions */
+
+/*
+ This library supports reading and writing files in gzip (.gz) format with
+ an interface similar to that of stdio, using the functions that start with
+ "gz". The gzip format is different from the zlib format: it is a wrapper,
+ documented in RFC 1952, around a deflate stream.
+*/
+
+typedef struct gzFile_s *gzFile; /* semi-opaque gzip file descriptor */
+
+/*
+ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
+
+ Opens a gzip (.gz) file for reading or writing. The mode parameter is as
+ in fopen ("rb" or "wb") but can also include a compression level ("wb9") or
+ a strategy: 'f' for filtered data as in "wb6f", 'h' for Huffman-only
+ compression as in "wb1h", 'R' for run-length encoding as in "wb1R", or 'F'
+ for fixed code compression as in "wb9F". (See the description of
+ deflateInit2 for more information about the strategy parameter.) 'T' will
+ request transparent writing or appending with no compression and not using
+ the gzip format.
+
+ "a" can be used instead of "w" to request that the gzip stream that will
+ be written be appended to the file. "+" will result in an error, since
+ reading and writing to the same gzip file is not supported. The addition of
+ "x" when writing will create the file exclusively, which fails if the file
+ already exists. On systems that support it, the addition of "e" when
+ reading or writing will set the flag to close the file on an execve() call.
+
+ These functions, as well as gzip, will read and decode a sequence of gzip
+ streams in a file. The append function of gzopen() can be used to create
+ such a file. (Also see gzflush() for another way to do this.) When
+ appending, gzopen does not test whether the file begins with a gzip stream,
+ nor does it look for the end of the gzip streams to begin appending. gzopen
+ will simply append a gzip stream to the existing file.
+
+ gzopen can be used to read a file which is not in gzip format; in this
+ case gzread will directly read from the file without decompression. When
+ reading, this will be detected automatically by looking for the magic two-
+ byte gzip header.
+
+ gzopen returns NULL if the file could not be opened, if there was
+ insufficient memory to allocate the gzFile state, or if an invalid mode was
+ specified (an 'r', 'w', or 'a' was not provided, or '+' was provided).
+ errno can be checked to determine if the reason gzopen failed was that the
+ file could not be opened.
+*/
+
+ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+/*
+ gzdopen associates a gzFile with the file descriptor fd. File descriptors
+ are obtained from calls like open, dup, creat, pipe or fileno (if the file
+ has been previously opened with fopen). The mode parameter is as in gzopen.
+
+ The next call of gzclose on the returned gzFile will also close the file
+ descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor
+ fd. If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd,
+ mode);. The duplicated descriptor should be saved to avoid a leak, since
+ gzdopen does not close fd if it fails. If you are using fileno() to get the
+ file descriptor from a FILE *, then you will have to use dup() to avoid
+ double-close()ing the file descriptor. Both gzclose() and fclose() will
+ close the associated file descriptor, so they need to have different file
+ descriptors.
+
+ gzdopen returns NULL if there was insufficient memory to allocate the
+ gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not
+ provided, or '+' was provided), or if fd is -1. The file descriptor is not
+ used until the next gz* read, write, seek, or close operation, so gzdopen
+ will not detect if fd is invalid (unless fd is -1).
+*/
+
+ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size));
+/*
+ Set the internal buffer size used by this library's functions. The
+ default buffer size is 8192 bytes. This function must be called after
+ gzopen() or gzdopen(), and before any other calls that read or write the
+ file. The buffer memory allocation is always deferred to the first read or
+ write. Two buffers are allocated, either both of the specified size when
+ writing, or one of the specified size and the other twice that size when
+ reading. A larger buffer size of, for example, 64K or 128K bytes will
+ noticeably increase the speed of decompression (reading).
+
+ The new buffer size also affects the maximum length for gzprintf().
+
+ gzbuffer() returns 0 on success, or -1 on failure, such as being called
+ too late.
+*/
+
+ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
+/*
+ Dynamically update the compression level or strategy. See the description
+ of deflateInit2 for the meaning of these parameters.
+
+ gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
+ opened for writing.
+*/
+
+ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
+/*
+ Reads the given number of uncompressed bytes from the compressed file. If
+ the input file is not in gzip format, gzread copies the given number of
+ bytes into the buffer directly from the file.
+
+ After reaching the end of a gzip stream in the input, gzread will continue
+ to read, looking for another gzip stream. Any number of gzip streams may be
+ concatenated in the input file, and will all be decompressed by gzread().
+ If something other than a gzip stream is encountered after a gzip stream,
+ that remaining trailing garbage is ignored (and no error is returned).
+
+ gzread can be used to read a gzip file that is being concurrently written.
+ Upon reaching the end of the input, gzread will return with the available
+ data. If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then
+ gzclearerr can be used to clear the end of file indicator in order to permit
+ gzread to be tried again. Z_OK indicates that a gzip stream was completed
+ on the last gzread. Z_BUF_ERROR indicates that the input file ended in the
+ middle of a gzip stream. Note that gzread does not return -1 in the event
+ of an incomplete gzip stream. This error is deferred until gzclose(), which
+ will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip
+ stream. Alternatively, gzerror can be used before gzclose to detect this
+ case.
+
+ gzread returns the number of uncompressed bytes actually read, less than
+ len for end of file, or -1 for error.
+*/
+
+ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
+ voidpc buf, unsigned len));
+/*
+ Writes the given number of uncompressed bytes into the compressed file.
+ gzwrite returns the number of uncompressed bytes written or 0 in case of
+ error.
+*/
+
+ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...));
+/*
+ Converts, formats, and writes the arguments to the compressed file under
+ control of the format string, as in fprintf. gzprintf returns the number of
+ uncompressed bytes actually written, or 0 in case of error. The number of
+ uncompressed bytes written is limited to 8191, or one less than the buffer
+ size given to gzbuffer(). The caller should assure that this limit is not
+ exceeded. If it is exceeded, then gzprintf() will return an error (0) with
+ nothing written. In this case, there may also be a buffer overflow with
+ unpredictable consequences, which is possible only if zlib was compiled with
+ the insecure functions sprintf() or vsprintf() because the secure snprintf()
+ or vsnprintf() functions were not available. This can be determined using
+ zlibCompileFlags().
+*/
+
+ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
+/*
+ Writes the given null-terminated string to the compressed file, excluding
+ the terminating null character.
+
+ gzputs returns the number of characters written, or -1 in case of error.
+*/
+
+ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
+/*
+ Reads bytes from the compressed file until len-1 characters are read, or a
+ newline character is read and transferred to buf, or an end-of-file
+ condition is encountered. If any characters are read or if len == 1, the
+ string is terminated with a null character. If no characters are read due
+ to an end-of-file or len < 1, then the buffer is left untouched.
+
+ gzgets returns buf which is a null-terminated string, or it returns NULL
+ for end-of-file or in case of error. If there was an error, the contents at
+ buf are indeterminate.
+*/
+
+ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
+/*
+ Writes c, converted to an unsigned char, into the compressed file. gzputc
+ returns the value that was written, or -1 in case of error.
+*/
+
+ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
+/*
+ Reads one byte from the compressed file. gzgetc returns this byte or -1
+ in case of end of file or error. This is implemented as a macro for speed.
+ As such, it does not do all of the checking the other functions do. I.e.
+ it does not check to see if file is NULL, nor whether the structure file
+ points to has been clobbered or not.
+*/
+
+ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
+/*
+ Push one character back onto the stream to be read as the first character
+ on the next read. At least one character of push-back is allowed.
+ gzungetc() returns the character pushed, or -1 on failure. gzungetc() will
+ fail if c is -1, and may fail if a character has been pushed but not read
+ yet. If gzungetc is used immediately after gzopen or gzdopen, at least the
+ output buffer size of pushed characters is allowed. (See gzbuffer above.)
+ The pushed character will be discarded if the stream is repositioned with
+ gzseek() or gzrewind().
+*/
+
+ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
+/*
+ Flushes all pending output into the compressed file. The parameter flush
+ is as in the deflate() function. The return value is the zlib error number
+ (see function gzerror below). gzflush is only permitted when writing.
+
+ If the flush parameter is Z_FINISH, the remaining data is written and the
+ gzip stream is completed in the output. If gzwrite() is called again, a new
+ gzip stream will be started in the output. gzread() is able to read such
+ concatenated gzip streams.
+
+ gzflush should be called only when strictly necessary because it will
+ degrade compression if called too often.
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
+ z_off_t offset, int whence));
+
+ Sets the starting position for the next gzread or gzwrite on the given
+ compressed file. The offset represents a number of bytes in the
+ uncompressed data stream. The whence parameter is defined as in lseek(2);
+ the value SEEK_END is not supported.
+
+ If the file is opened for reading, this function is emulated but can be
+ extremely slow. If the file is opened for writing, only forward seeks are
+ supported; gzseek then compresses a sequence of zeroes up to the new
+ starting position.
+
+ gzseek returns the resulting offset location as measured in bytes from
+ the beginning of the uncompressed stream, or -1 in case of error, in
+ particular if the file is opened for writing and the new starting position
+ would be before the current position.
+*/
+
+ZEXTERN int ZEXPORT gzrewind OF((gzFile file));
+/*
+ Rewinds the given file. This function is supported only for reading.
+
+ gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file));
+
+ Returns the starting position for the next gzread or gzwrite on the given
+ compressed file. This position represents a number of bytes in the
+ uncompressed data stream, and is zero when starting, even if appending or
+ reading a gzip stream from the middle of a file using gzdopen().
+
+ gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file));
+
+ Returns the current offset in the file being read or written. This offset
+ includes the count of bytes that precede the gzip stream, for example when
+ appending or when using gzdopen() for reading. When reading, the offset
+ does not include as yet unused buffered input. This information can be used
+ for a progress indicator. On error, gzoffset() returns -1.
+*/
+
+ZEXTERN int ZEXPORT gzeof OF((gzFile file));
+/*
+ Returns true (1) if the end-of-file indicator has been set while reading,
+ false (0) otherwise. Note that the end-of-file indicator is set only if the
+ read tried to go past the end of the input, but came up short. Therefore,
+ just like feof(), gzeof() may return false even if there is no more data to
+ read, in the event that the last read request was for the exact number of
+ bytes remaining in the input file. This will happen if the input file size
+ is an exact multiple of the buffer size.
+
+ If gzeof() returns true, then the read functions will return no more data,
+ unless the end-of-file indicator is reset by gzclearerr() and the input file
+ has grown since the previous end of file was detected.
+*/
+
+ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
+/*
+ Returns true (1) if file is being copied directly while reading, or false
+ (0) if file is a gzip stream being decompressed.
+
+ If the input file is empty, gzdirect() will return true, since the input
+ does not contain a gzip stream.
+
+ If gzdirect() is used immediately after gzopen() or gzdopen() it will
+ cause buffers to be allocated to allow reading the file to determine if it
+ is a gzip file. Therefore if gzbuffer() is used, it should be called before
+ gzdirect().
+
+ When writing, gzdirect() returns true (1) if transparent writing was
+ requested ("wT" for the gzopen() mode), or false (0) otherwise. (Note:
+ gzdirect() is not needed when writing. Transparent writing must be
+ explicitly requested, so the application already knows the answer. When
+ linking statically, using gzdirect() will include all of the zlib code for
+ gzip file reading and decompression, which may not be desired.)
+*/
+
+ZEXTERN int ZEXPORT gzclose OF((gzFile file));
+/*
+ Flushes all pending output if necessary, closes the compressed file and
+ deallocates the (de)compression state. Note that once file is closed, you
+ cannot call gzerror with file, since its structures have been deallocated.
+ gzclose must not be called more than once on the same file, just as free
+ must not be called more than once on the same allocation.
+
+ gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a
+ file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the
+ last read ended in the middle of a gzip stream, or Z_OK on success.
+*/
+
+ZEXTERN int ZEXPORT gzclose_r OF((gzFile file));
+ZEXTERN int ZEXPORT gzclose_w OF((gzFile file));
+/*
+ Same as gzclose(), but gzclose_r() is only for use when reading, and
+ gzclose_w() is only for use when writing or appending. The advantage to
+ using these instead of gzclose() is that they avoid linking in zlib
+ compression or decompression code that is not used when only reading or only
+ writing respectively. If gzclose() is used, then both compression and
+ decompression code will be included in the application when linking to a static
+ zlib library.
+*/
+
+ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
+/*
+ Returns the error message for the last error which occurred on the given
+ compressed file. errnum is set to zlib error number. If an error occurred
+ in the file system and not in the compression library, errnum is set to
+ Z_ERRNO and the application may consult errno to get the exact error code.
+
+ The application must not modify the returned string. Future calls to
+ this function may invalidate the previously returned string. If file is
+ closed, then the string previously returned by gzerror will no longer be
+ available.
+
+ gzerror() should be used to distinguish errors from end-of-file for those
+ functions above that do not distinguish those cases in their return values.
+*/
+
+ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
+/*
+ Clears the error and end-of-file flags for file. This is analogous to the
+ clearerr() function in stdio. This is useful for continuing to read a gzip
+ file that is being written concurrently.
+*/
+
+#endif /* !Z_SOLO */
+
+ /* checksum functions */
+
+/*
+ These functions are not related to compression but are exported
+ anyway because they might be useful in applications using the compression
+ library.
+*/
+
+ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+/*
+ Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+ return the updated checksum. If buf is Z_NULL, this function returns the
+ required initial value for the checksum.
+
+ An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+ much faster.
+
+ Usage example:
+
+ uLong adler = adler32(0L, Z_NULL, 0);
+
+ while (read_buffer(buffer, length) != EOF) {
+ adler = adler32(adler, buffer, length);
+ }
+ if (adler != original_adler) error();
+*/
+
+/*
+ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
+ z_off_t len2));
+
+ Combine two Adler-32 checksums into one. For two sequences of bytes, seq1
+ and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+ each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of
+ seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. Note
+ that the z_off_t type (like off_t) is a signed integer. If len2 is
+ negative, the result has no meaning or utility.
+*/
+
+ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
+/*
+ Update a running CRC-32 with the bytes buf[0..len-1] and return the
+ updated CRC-32. If buf is Z_NULL, this function returns the required
+ initial value for the crc. Pre- and post-conditioning (one's complement) is
+ performed within this function so it shouldn't be done by the application.
+
+ Usage example:
+
+ uLong crc = crc32(0L, Z_NULL, 0);
+
+ while (read_buffer(buffer, length) != EOF) {
+ crc = crc32(crc, buffer, length);
+ }
+ if (crc != original_crc) error();
+*/
+
+/*
+ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
+
+ Combine two CRC-32 check values into one. For two sequences of bytes,
+ seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+ calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32
+ check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+ len2.
+*/
+
+
+ /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method,
+ int windowBits, int memLevel,
+ int strategy, const char *version,
+ int stream_size));
+ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window,
+ const char *version,
+ int stream_size));
+#define deflateInit(strm, level) \
+ deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
+#define inflateInit(strm) \
+ inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream))
+#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+ deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+ (strategy), ZLIB_VERSION, (int)sizeof(z_stream))
+#define inflateInit2(strm, windowBits) \
+ inflateInit2_((strm), (windowBits), ZLIB_VERSION, \
+ (int)sizeof(z_stream))
+#define inflateBackInit(strm, windowBits, window) \
+ inflateBackInit_((strm), (windowBits), (window), \
+ ZLIB_VERSION, (int)sizeof(z_stream))
+
+#ifndef Z_SOLO
+
+/* gzgetc() macro and its supporting function and exposed data structure. Note
+ * that the real internal state is much larger than the exposed structure.
+ * This abbreviated structure exposes just enough for the gzgetc() macro. The
+ * user should not mess with these exposed elements, since their names or
+ * behavior could change in the future, perhaps even capriciously. They can
+ * only be used by the gzgetc() macro. You have been warned.
+ */
+struct gzFile_s {
+ unsigned have;
+ unsigned char *next;
+ z_off64_t pos;
+};
+ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file)); /* backward compatibility */
+#ifdef Z_PREFIX_SET
+# undef z_gzgetc
+# define z_gzgetc(g) \
+ ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g))
+#else
+# define gzgetc(g) \
+ ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g))
+#endif
+
+/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or
+ * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if
+ * both are true, the application gets the *64 functions, and the regular
+ * functions are changed to 64 bits) -- in case these are set on systems
+ * without large file support, _LFS64_LARGEFILE must also be true
+ */
+#ifdef Z_LARGE64
+ ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+ ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
+ ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
+ ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off64_t));
+ ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off64_t));
+#endif
+
+#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64)
+# ifdef Z_PREFIX_SET
+# define z_gzopen z_gzopen64
+# define z_gzseek z_gzseek64
+# define z_gztell z_gztell64
+# define z_gzoffset z_gzoffset64
+# define z_adler32_combine z_adler32_combine64
+# define z_crc32_combine z_crc32_combine64
+# else
+# define gzopen gzopen64
+# define gzseek gzseek64
+# define gztell gztell64
+# define gzoffset gzoffset64
+# define adler32_combine adler32_combine64
+# define crc32_combine crc32_combine64
+# endif
+# ifndef Z_LARGE64
+ ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+ ZEXTERN z_off_t ZEXPORT gzseek64 OF((gzFile, z_off_t, int));
+ ZEXTERN z_off_t ZEXPORT gztell64 OF((gzFile));
+ ZEXTERN z_off_t ZEXPORT gzoffset64 OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
+# endif
+#else
+ ZEXTERN gzFile ZEXPORT gzopen OF((const char *, const char *));
+ ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile, z_off_t, int));
+ ZEXTERN z_off_t ZEXPORT gztell OF((gzFile));
+ ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
+#endif
+
+#else /* Z_SOLO */
+
+ ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
+
+#endif /* !Z_SOLO */
+
+/* hack for buggy compilers */
+#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL)
+ struct internal_state {int dummy;};
+#endif
+
+/* undocumented functions */
+ZEXTERN const char * ZEXPORT zError OF((int));
+ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp));
+ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table OF((void));
+ZEXTERN int ZEXPORT inflateUndermine OF((z_streamp, int));
+ZEXTERN int ZEXPORT inflateResetKeep OF((z_streamp));
+ZEXTERN int ZEXPORT deflateResetKeep OF((z_streamp));
+#if defined(_WIN32) && !defined(Z_SOLO)
+ZEXTERN gzFile ZEXPORT gzopen_w OF((const wchar_t *path,
+ const char *mode));
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZLIB_H */
diff --git a/kmc_tools/libs/zlibstat.lib b/kmc_tools/libs/zlibstat.lib
new file mode 100644
index 0000000..f33db8e
Binary files /dev/null and b/kmc_tools/libs/zlibstat.lib differ
diff --git a/kmer_counter/meta_oper.h b/kmc_tools/meta_oper.h
similarity index 64%
copy from kmer_counter/meta_oper.h
copy to kmc_tools/meta_oper.h
index 9d82a1d..d2e29ff 100644
--- a/kmer_counter/meta_oper.h
+++ b/kmc_tools/meta_oper.h
@@ -1,11 +1,11 @@
/*
- This file is a part of KMC software distributed under GNU GPL 3 licence.
- The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
-
- Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
-
- Version: 2.2.0
- Date : 2015-04-15
+This file is a part of KMC software distributed under GNU GPL 3 licence.
+The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+Version: 2.3.0
+Date : 2015-08-21
*/
#ifndef _META_OPER_H
@@ -19,7 +19,7 @@ template <size_t N> struct uint_{ };
// For loop (forward)
template <size_t N, typename Lambda>
inline void IterFwd(const Lambda &oper, uint_<N>) {
- IterFwd(oper, uint_<N-1>());
+ IterFwd(oper, uint_<N - 1>());
oper(N);
}
@@ -32,7 +32,7 @@ inline void IterFwd(const Lambda &oper, uint_<0>) {
template <size_t N, typename Lambda>
inline void IterRev(const Lambda &oper, uint_<N>) {
oper(N);
- IterRev(oper, uint_<N-1>());
+ IterRev(oper, uint_<N - 1>());
}
template <typename Lambda>
diff --git a/kmc_tools/nc_utils.cpp b/kmc_tools/nc_utils.cpp
new file mode 100644
index 0000000..ba90c91
--- /dev/null
+++ b/kmc_tools/nc_utils.cpp
@@ -0,0 +1,17 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include "nc_utils.h"
+
+
+uchar CNumericConversions::digits[100000*5];
+int CNumericConversions::powOf10[30];
+CNumericConversions::_si CNumericConversions::_init;
\ No newline at end of file
diff --git a/kmc_dump/nc_utils.h b/kmc_tools/nc_utils.h
similarity index 79%
copy from kmc_dump/nc_utils.h
copy to kmc_tools/nc_utils.h
index fd0ebb3..fc23c57 100644
--- a/kmc_dump/nc_utils.h
+++ b/kmc_tools/nc_utils.h
@@ -1,18 +1,16 @@
/*
This file is a part of KMC software distributed under GNU GPL 3 licence.
The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
-
- This file demonstrates the example usage of kmc_api software.
- It reads kmer_counter's output and prints kmers to an output file.
-
- Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
-
- Version: 2.2.0
- Date : 2015-04-15
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include <string>
-#include "../kmc_api/kmer_defs.h"
+#include "defs.h"
+#include <cstring>
#ifndef _NC_UTILS_H
#define _NC_UTILS_H
@@ -115,24 +113,6 @@ public:
return ndig;
}
}
-
- static int Double2PChar(double val, int prec, uchar *str)
- {
- double corrector = .5 / powOf10[prec];
- val += corrector;
- double ipart;
- double fractPart = std::modf(val, &ipart);
- uint32 intPart = (uint32)ipart;
- uint32 len = Int2PChar(intPart, str);
- uint32 pos = len;
- str[pos++] = '.';
- for(int i = 0 ; i < prec ; ++i)
- {
- fractPart *= 10;
- str[pos++] = '0' + (uint32)fractPart % 10 ;
- }
- return len + prec + 1;
- }
};
#endif
\ No newline at end of file
diff --git a/kmc_tools/operations.h b/kmc_tools/operations.h
new file mode 100644
index 0000000..4eef01c
--- /dev/null
+++ b/kmc_tools/operations.h
@@ -0,0 +1,284 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _OPERATIONS_H
+#define _OPERATIONS_H
+
+
+#ifdef ENABLE_DEBUG
+#include "config.h"
+#endif
+#include <iostream>
+#include "bundle.h"
+
+//************************************************************************************************************
+// C2ArgOper - abstract class representing a two-argument operation
+//************************************************************************************************************
+template<unsigned SIZE> class C2ArgOper : public CInput<SIZE>
+{
+protected:
+ CBundle<SIZE>* input1, *input2;
+public:
+ C2ArgOper(CBundle<SIZE>* input1, CBundle<SIZE>* input2) :
+ input1(input1), input2(input2)
+ {
+ }
+
+ void IgnoreRest() override
+ {
+ input1->IgnoreRest();
+ input2->IgnoreRest();
+ }
+
+ ~C2ArgOper() override
+ {
+ delete input1;
+ delete input2;
+ }
+};
+
+//************************************************************************************************************
+// CUnion - implementation of the union operation on two k-mer sets (counters of k-mers present in both inputs are summed).
+//************************************************************************************************************
+template <unsigned SIZE> class CUnion : public C2ArgOper<SIZE>
+{
+public:
+ CUnion(CBundle<SIZE>* input1, CBundle<SIZE>* input2) : C2ArgOper<SIZE>(input1, input2)
+ {
+ }
+ void NextBundle(CBundle<SIZE>& bundle) override
+ {
+ while (!this->input1->Finished() && !this->input2->Finished())
+ {
+ if (bundle.Full())
+ {
+ return;
+ }
+ if (this->input1->TopKmer() == this->input2->TopKmer())
+ {
+ bundle.Insert(this->input1->TopKmer(), this->input1->TopCounter() + this->input2->TopCounter());
+ this->input1->Pop();
+ this->input2->Pop();
+ }
+ else if (this->input1->TopKmer() < this->input2->TopKmer())
+ {
+ bundle.Insert(this->input1->TopKmer(), this->input1->TopCounter());
+ this->input1->Pop();
+ }
+ else
+ {
+ bundle.Insert(this->input2->TopKmer(), this->input2->TopCounter());
+ this->input2->Pop();
+ }
+ }
+ CBundle<SIZE>* non_empty_bundle = this->input1->Finished() ? this->input2 : this->input1;
+ while (!non_empty_bundle->Finished())
+ {
+ if (bundle.Full())
+ {
+ return;
+ }
+ bundle.Insert(non_empty_bundle->TopKmer(), non_empty_bundle->TopCounter());
+ non_empty_bundle->Pop();
+ }
+ this->finished = true;
+ }
+};
+
+//************************************************************************************************************
+// CIntersection - implementation of the intersection operation on two k-mer sets (the smaller of the two counters is kept).
+//************************************************************************************************************
+template<unsigned SIZE> class CIntersection : public C2ArgOper<SIZE>
+{
+public:
+ CIntersection(CBundle<SIZE>* input1, CBundle<SIZE>* input2) : C2ArgOper<SIZE>(input1, input2)
+ {
+ }
+ void NextBundle(CBundle<SIZE>& bundle) override
+ {
+ while (!this->input1->Finished() && !this->input2->Finished())
+ {
+ /*this->input1->Top(kmer1, counter1);
+ this->input2->Top(kmer2, counter2);*/
+
+ if (this->input1->TopKmer() == this->input2->TopKmer())
+ {
+ bundle.Insert(this->input1->TopKmer(), MIN(this->input1->TopCounter(), this->input2->TopCounter()));
+ this->input1->Pop();
+ this->input2->Pop();
+ if (bundle.Full())
+ return;
+ }
+ else if (this->input1->TopKmer() < this->input2->TopKmer())
+ this->input1->Pop();
+ else
+ this->input2->Pop();
+ }
+ if (!this->input1->Finished())
+ this->input1->IgnoreRest();
+ if (!this->input2->Finished())
+ this->input2->IgnoreRest();
+ this->finished = true;
+ }
+};
+
+//************************************************************************************************************
+// CKmersSubtract - implementation of the subtraction operation on two k-mer sets.
+// If a k-mer exists in both inputs, it is absent from the result (counters do not matter).
+//************************************************************************************************************
+template<unsigned SIZE> class CKmersSubtract : public C2ArgOper<SIZE>
+{
+ //CKmer<SIZE> kmer1, kmer2;
+ //uint32 counter1, counter2;
+public:
+ CKmersSubtract(CBundle<SIZE>* input1, CBundle<SIZE>* input2) : C2ArgOper<SIZE>(input1, input2)
+ {
+ }
+ void NextBundle(CBundle<SIZE>& bundle) override
+ {
+ while (!this->input1->Finished() && !this->input2->Finished())
+ {
+ //this->input1->Top(kmer1, counter1);
+ //this->input2->Top(kmer2, counter2);
+ if (this->input2->TopKmer() < this->input1->TopKmer())
+ this->input2->Pop();
+ else if (this->input2->TopKmer() == this->input1->TopKmer())
+ {
+ this->input1->Pop();
+ this->input2->Pop();
+ }
+ else
+ {
+ bundle.Insert(this->input1->TopKmer(), this->input1->TopCounter());
+ this->input1->Pop();
+ if (bundle.Full())
+ return;
+ }
+ }
+
+ if(!this->input2->Finished())
+ this->input2->IgnoreRest();
+
+ while (!this->input1->Finished())
+ {
+ if (bundle.Full())
+ return;
+ bundle.Insert(this->input1->TopKmer(), this->input1->TopCounter());
+ this->input1->Pop();
+ }
+ this->finished = true;
+ }
+};
+
+
+
+
+//************************************************************************************************************
+// CCountersSubtract - implementation of the subtraction operation on two k-mer sets.
+// If a k-mer exists in both inputs, the second counter is subtracted from the first; the k-mer is kept only when the first counter is greater.
+//************************************************************************************************************
+template<unsigned SIZE> class CCountersSubtract : public C2ArgOper<SIZE>
+{
+ //CKmer<SIZE> kmer1, kmer2;
+ //uint32 counter1, counter2;
+public:
+ CCountersSubtract(CBundle<SIZE>* input1, CBundle<SIZE>* input2) : C2ArgOper<SIZE>(input1, input2)
+ {
+ }
+ void NextBundle(CBundle<SIZE>& bundle) override
+ {
+ while (!this->input1->Finished() && !this->input2->Finished())
+ {
+ //this->input1->Top(kmer1, counter1);
+ //this->input2->Top(kmer2, counter2);
+ if (this->input2->TopKmer() < this->input1->TopKmer())
+ this->input2->Pop();
+ else if (this->input2->TopKmer() == this->input1->TopKmer())
+ {
+ if (this->input1->TopCounter() > this->input2->TopCounter())
+ {
+ bundle.Insert(this->input1->TopKmer(), this->input1->TopCounter() - this->input2->TopCounter());
+ this->input1->Pop();
+ this->input2->Pop();
+ if (bundle.Full())
+ return;
+ }
+ else
+ {
+ this->input1->Pop();
+ this->input2->Pop();
+ }
+ }
+ else
+ {
+ bundle.Insert(this->input1->TopKmer(), this->input1->TopCounter());
+ this->input1->Pop();
+ if (bundle.Full())
+ return;
+ }
+ }
+
+ if (!this->input2->Finished())
+ this->input2->IgnoreRest();
+
+ while (!this->input1->Finished())
+ {
+ if (bundle.Full())
+ return;
+ bundle.Insert(this->input1->TopKmer(), this->input1->TopCounter());
+ this->input1->Pop();
+ }
+ this->finished = true;
+ }
+};
+
+template<unsigned SIZE> class CComparer
+{
+ CBundle<SIZE>* input1, *input2;
+public:
+ CComparer(CBundle<SIZE>* input1, CBundle<SIZE>* input2) : input1(input1), input2(input2)
+ {
+ }
+
+ bool Equals()
+ {
+ while (!this->input1->Finished() && !this->input2->Finished())
+ {
+ if (this->input1->TopCounter() != this->input2->TopCounter())
+ {
+ this->input1->IgnoreRest();
+ this->input2->IgnoreRest();
+ return false;
+ }
+ if (!(this->input1->TopKmer() == this->input2->TopKmer()))
+ {
+ this->input1->IgnoreRest();
+ this->input2->IgnoreRest();
+ return false;
+ }
+
+ this->input1->Pop();
+ this->input2->Pop();
+ }
+ if (!this->input1->Finished() || !this->input2->Finished())
+ {
+ std::cout << "one of input is not finished\n";
+ this->input1->IgnoreRest();
+ this->input2->IgnoreRest();
+ return false;
+ }
+ return true;
+ }
+};
+
+
+#endif
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/output_parser.h b/kmc_tools/output_parser.h
new file mode 100644
index 0000000..326cfdd
--- /dev/null
+++ b/kmc_tools/output_parser.h
@@ -0,0 +1,174 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _OUTPUT_PARSER_H
+#define _OUTPUT_PARSER_H
+
+#include "defs.h"
+#include <list>
+#include <map>
+#include "tokenizer.h"
+#include "expression_node.h"
+
+
+/*****************************************************************************************************************************/
+// This parser validates the grammar below:
+// expr -> term sum_op
+// sum_op -> PLUSMINUS term sum_op
+// sum_op -> TERMINATOR
+//
+// term -> argument term_op
+// term_op -> MUL argument term_op
+// term_op -> TERMINATOR
+// argument -> VARIABLE
+// argument -> OPEN_BRACKET expr CLOSE_BRACKET
+// This code is based on: https://github.com/mikailsheikh/cogitolearning-examples/tree/master/CogPar
+/*****************************************************************************************************************************/
+
+// Recursive-descent parser that turns a token list into a tree of
+// CExpressionNode objects. Variables are resolved through `input`, whose
+// mapped value is passed to CInputNode.
+template<unsigned SIZE> class COutputParser
+{
+    std::list<Token> tokens;                        // private copy, consumed from the front
+    const std::map<std::string, uint32>& input;     // variable name -> value given to CInputNode
+    Token curr_token;                               // token currently being examined
+    void nextToken();
+    CExpressionNode<SIZE>* argument();
+    CExpressionNode<SIZE>* term_op(CExpressionNode<SIZE>* left);
+    CExpressionNode<SIZE>* term();
+    CExpressionNode<SIZE>* sum_op(CExpressionNode<SIZE>* left);
+    CExpressionNode<SIZE>* expr();
+public:
+    COutputParser(std::list<Token>& tokens, const std::map<std::string, uint32>& input) :
+        tokens(tokens), input(input)
+    {
+        // front() on an empty list is undefined behaviour, so an empty
+        // expression is represented by a TERMINATOR token instead.
+        if (this->tokens.empty())
+            curr_token.second = TokenType::TERMINATOR;
+        else
+            curr_token = this->tokens.front();
+    }
+
+    CExpressionNode<SIZE>* Parse();
+};
+
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+// Parses the whole token list and returns the root of the expression tree.
+// Terminates the program when tokens are left over after the expression.
+template<unsigned SIZE>
+CExpressionNode<SIZE>* COutputParser<SIZE>::Parse()
+{
+    CExpressionNode<SIZE>* res = expr();
+    if (curr_token.second != TokenType::TERMINATOR)
+    {
+        // Newline added so the message is terminated like the other errors.
+        std::cout << "Error: wrong symbol :" << curr_token.first << "\n";
+        exit(1);
+    }
+#ifdef ENABLE_DEBUG
+    std::cout << "\n";
+    // expr() can yield nullptr (argument() returns it for unexpected tokens).
+    if (res)
+        res->Display();
+#endif
+    return res;
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Drops the front token and loads the next one; when none is left the
+// current token type becomes TERMINATOR (its text is left untouched).
+template<unsigned SIZE> void COutputParser<SIZE>::nextToken()
+{
+    tokens.pop_front();
+    if (!tokens.empty())
+    {
+        curr_token = tokens.front();
+        return;
+    }
+    curr_token.second = TokenType::TERMINATOR;
+}
+
+/*****************************************************************************************************************************/
+// argument -> VARIABLE | OPEN_BRACKET expr CLOSE_BRACKET
+// Returns the node for a variable or for a parenthesised sub-expression.
+// Any other token is a syntax error and terminates the program; previously
+// nullptr was returned and attached as a child by the callers.
+template<unsigned SIZE> CExpressionNode<SIZE>* COutputParser<SIZE>::argument()
+{
+    if (curr_token.second == TokenType::VARIABLE)
+    {
+        //check if this variable was defined
+        auto elem = input.find(curr_token.first);
+        if (elem == input.end())
+        {
+            std::cout << "Error: variable " << curr_token.first << " was not defined\n";
+            exit(1);
+        }
+        CExpressionNode<SIZE>* res = new CInputNode<SIZE>(elem->second);
+        nextToken();
+        return res;
+    }
+    if (curr_token.second == TokenType::PARENTHESIS_OPEN)
+    {
+        nextToken();
+        CExpressionNode<SIZE>* res = expr();
+        if (curr_token.second != TokenType::PARENTHESIS_CLOSE)
+        {
+            std::cout << "Error: close parenthesis expected, but " << curr_token.first << " found\n";
+            exit(1);
+        }
+        nextToken();
+        return res;
+    }
+
+    // nextToken() only changes the token type at the end of input, so the
+    // token text is stale in that case and must not be printed.
+    if (curr_token.second == TokenType::TERMINATOR)
+        std::cout << "Error: variable or open parenthesis expected, but end of expression found\n";
+    else
+        std::cout << "Error: variable or open parenthesis expected, but " << curr_token.first << " found\n";
+    exit(1);
+}
+
+/*****************************************************************************************************************************/
+// term_op -> MUL argument term_op | (nothing)
+// Folds every following "* argument" into a left-leaning chain of
+// intersection nodes and returns the top of that chain.
+template<unsigned SIZE> CExpressionNode<SIZE>* COutputParser<SIZE>::term_op(CExpressionNode<SIZE>* left)
+{
+    CExpressionNode<SIZE>* subtree = left;
+    while (curr_token.second == TokenType::MUL_OPER)
+    {
+        CExpressionNode<SIZE>* product = new CIntersectionNode<SIZE>;
+        product->AddLeftChild(subtree);
+        nextToken();
+        product->AddRightChild(argument());
+        subtree = product;
+    }
+    return subtree;
+}
+// term -> argument term_op
+template<unsigned SIZE> CExpressionNode<SIZE>* COutputParser<SIZE>::term()
+{
+    CExpressionNode<SIZE>* first_argument = argument();
+    return term_op(first_argument);
+}
+
+/*****************************************************************************************************************************/
+// sum_op -> PLUSMINUS term sum_op | (nothing)
+// Folds every following "<operator> term" into a left-leaning chain of
+// union / k-mer subtraction / counter subtraction nodes.
+template<unsigned SIZE> CExpressionNode<SIZE>* COutputParser<SIZE>::sum_op(CExpressionNode<SIZE>* left)
+{
+    CExpressionNode<SIZE>* subtree = left;
+    for (;;)
+    {
+        CExpressionNode<SIZE>* node = nullptr;
+        switch (curr_token.second)
+        {
+        case TokenType::PLUS_OPER:
+            node = new CUnionNode<SIZE>;
+            break;
+        case TokenType::STRICT_MINUS_OPER:
+            node = new CKmersSubtractionNode<SIZE>;
+            break;
+        case TokenType::COUNTER_MINUS_OPER:
+            node = new CCountersSubtractionNode<SIZE>;
+            break;
+        default:
+            // No further additive operator: the chain is complete.
+            return subtree;
+        }
+        node->AddLeftChild(subtree);
+        nextToken();
+        node->AddRightChild(term());
+        subtree = node;
+    }
+}
+
+/*****************************************************************************************************************************/
+// expr -> term sum_op
+template<unsigned SIZE> CExpressionNode<SIZE>* COutputParser<SIZE>::expr()
+{
+    CExpressionNode<SIZE>* first_term = term();
+    return sum_op(first_term);
+}
+
+
+
+#endif
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/parameters_parser.cpp b/kmc_tools/parameters_parser.cpp
new file mode 100644
index 0000000..9131033
--- /dev/null
+++ b/kmc_tools/parameters_parser.cpp
@@ -0,0 +1,536 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include "parameters_parser.h"
+#include <iostream>
+using namespace std;
+
+
+// Returns val unchanged unless it is 0; in that case prints a warning naming
+// param_name and returns value_to_set_if_zero instead.
+uint32 CParametersParser::replace_zero(uint32 val, const char* param_name, uint32 value_to_set_if_zero)
+{
+    if (val != 0)
+        return val;
+
+    cout << "Warning: min value for " << param_name << " is " << value_to_set_if_zero << ". Your value will be converted to " << value_to_set_if_zero << "\n";
+    return value_to_set_if_zero;
+}
+
+
+// Parses the value that follows the 3-character option prefix of argv[pos]
+// and advances pos. A value containing '.' is read as a real number and must
+// lie in [0;1]; otherwise it is read as an integer. force_float / force_int
+// record which form has been used so that mixing both forms is rejected.
+void CParametersParser::parse_int_or_float(bool& force_float, bool& force_int, float& float_value, uint32& int_val, const char* param_name)
+{
+    const char* const value_text = argv[pos++] + 3;
+    const bool is_real = strchr(value_text, '.') != nullptr;
+
+    if (is_real)
+    {
+        float_value = (float)atof(value_text);
+        if (float_value > 1.0f || float_value < 0.0f)
+        {
+            cout << " Error: wrong value for fastq input parameter: "<< param_name <<"\n";
+            exit(1);
+        }
+        if (force_int)
+        {
+            cout << "Error: both -ci, -cx must be specified as real number [0;1] or as integer \n";
+            exit(1);
+        }
+        force_float = true;
+    }
+    else
+    {
+        int_val = atoi(value_text);
+        if (force_float)
+        {
+            cout << "Error: both -ci, -cx must be specified as real number [0;1] or as integer \n";
+            exit(1);
+        }
+        force_int = true;
+    }
+    config.filtering_params.use_float_value = is_real;
+}
+
+// Consumes the leading '-' options: "-t<n>" sets the thread count, an option
+// whose second character is 'v' enables verbose mode and "-hp" hides the
+// percent progress. Other '-' options are skipped without any action.
+void CParametersParser::parse_global_params()
+{
+    //defaults
+    config.avaiable_threads = thread::hardware_concurrency();
+
+    //override defaults if specified
+    while (pos < argc && argv[pos][0] == '-')
+    {
+        const char* const option = argv[pos];
+        if (strncmp(option, "-t", 2) == 0)
+            config.avaiable_threads = atoi(option + 2);
+        else if (option[1] == 'v')
+            config.verbose = true;
+        else if (strncmp(option, "-hp", 3) == 0)
+            config.percent_progress.Hide();
+        ++pos;
+    }
+}
+
+// Reads the read-file argument of the filter operation. A name starting with
+// '@' denotes a text file listing one input file per line; any other name is
+// taken as a single input file. Then up to three of -ci / -cx / -fa / -fq
+// are consumed. The output file type is initialised to the input file type.
+void CParametersParser::read_input_fastq_desc()
+{
+    if (pos >= argc)
+    {
+        cout << "Error: Input fastq files(s) missed\n";
+        exit(1);
+    }
+    if (argv[pos][0] == '-')
+    {
+        cout << "Error: Input fastq file(s) required, but " << argv[pos] << " found\n";
+        exit(1);
+    }
+
+    auto& params = config.filtering_params;
+    const string source = argv[pos++];
+    if (source[0] == '@')
+    {
+        const char* const list_path = source.c_str() + 1;
+        ifstream list_file(list_path);
+        if (!list_file.good())
+        {
+            cout << "Error: No " << list_path << " file\n";
+            exit(1);
+        }
+        // Empty lines in the list are skipped.
+        for (string entry; getline(list_file, entry); )
+        {
+            if (entry != "")
+                params.input_srcs.push_back(entry);
+        }
+        list_file.close();
+    }
+    else
+    {
+        params.input_srcs.push_back(source);
+    }
+
+    bool force_float = false;
+    bool force_int = false;
+
+    for (int i = 0; i < 3 && pos < argc && argv[pos][0] == '-'; ++i)
+    {
+        const char* const option = argv[pos];
+        if (strncmp(option, "-ci", 3) == 0)
+        {
+            parse_int_or_float(force_float, force_int, params.f_min_kmers, params.n_min_kmers, "-ci");
+        }
+        else if (strncmp(option, "-cx", 3) == 0)
+        {
+            parse_int_or_float(force_float, force_int, params.f_max_kmers, params.n_max_kmers, "-cx");
+        }
+        else if (strncmp(option, "-f", 2) == 0)
+        {
+            ++pos;
+            if (option[2] == 'a')
+                params.input_file_type = CFilteringParams::file_type::fasta;
+            else if (option[2] == 'q')
+                params.input_file_type = CFilteringParams::file_type::fastq;
+            else
+            {
+                cout << "Error: unknow parameter " << option << "\n";
+                exit(1);
+            }
+        }
+    }
+    params.output_file_type = params.input_file_type;
+}
+
+
+// Reads the output file name of the filter operation followed by any number
+// of -fq / -fa options. Requesting fastq output for fasta input is rejected,
+// as is any other option.
+void CParametersParser::read_output_fastq_desc()
+{
+    if (pos >= argc)
+    {
+        cout << "Error: Output fastq source missed\n";
+        exit(1);
+    }
+    if (strncmp(argv[pos], "-", 1) == 0)
+    {
+        // A space was missing before "found", gluing it to the argument.
+        cout << "Error: Output fastq source required, but " << argv[pos] << " found\n";
+        exit(1);
+    }
+    config.filtering_params.output_src = argv[pos++];
+
+    while (pos < argc && argv[pos][0] == '-')
+    {
+        if (strncmp(argv[pos], "-f", 2) == 0)
+        {
+            switch (argv[pos][2])
+            {
+            case 'q':
+                config.filtering_params.output_file_type = CFilteringParams::file_type::fastq;
+                break;
+            case 'a':
+                config.filtering_params.output_file_type = CFilteringParams::file_type::fasta;
+                break;
+            default:
+                cout << "Error: unknown parameter " << argv[pos] << "\n";
+                exit(1);
+                break;
+            }
+            if (config.filtering_params.input_file_type == CFilteringParams::file_type::fasta && config.filtering_params.output_file_type == CFilteringParams::file_type::fastq)
+            {
+                cout << "Error: cannot set -fq for output when -fa is set for input\n";
+                exit(1);
+            }
+        }
+        else
+        {
+            cout << "Error: Unknown parameter: " << argv[pos] << "\n";
+            exit(1);
+        }
+        ++pos;
+    }
+}
+
+
+// Consumes the '-' options that follow the dump operation name. An option
+// starting with "-s" requests sorted output; anything else only produces a
+// warning.
+void CParametersParser::read_dump_params()
+{
+    for (; pos < argc && argv[pos][0] == '-'; ++pos)
+    {
+        if (strncmp(argv[pos], "-s", 2) != 0)
+        {
+            cout << "Warning: Unknow parameter for dump operation: " << argv[pos] << "\n";
+            continue;
+        }
+        config.dump_params.sorted_output = true;
+    }
+}
+
+// Reads one input database name and appends its description to
+// config.input_desc, then consumes up to two -ci<value> / -cx<value>
+// options for it. A value of 0 is replaced by 1 (see replace_zero).
+void CParametersParser::read_input_desc()
+{
+    if (pos >= argc)
+    {
+        cout << "Error: Input database source missed\n";
+        exit(1);
+    }
+    if (strncmp(argv[pos], "-", 1) == 0)
+    {
+        // A space was missing before "found", gluing it to the argument.
+        cout << "Error: Input database source required, but " << argv[pos] << " found\n";
+        exit(1);
+    }
+    CInputDesc desc(argv[pos++]);
+    config.input_desc.push_back(desc);
+    for (int i = 0; i < 2 && pos < argc; ++i)
+    {
+        if (strncmp(argv[pos], "-", 1) != 0)
+            break;
+        if (strncmp(argv[pos], "-ci", 3) == 0)
+        {
+            config.input_desc.back().cutoff_min = replace_zero(atoi(argv[pos++] + 3), "-ci", 1);
+        }
+        else if (strncmp(argv[pos], "-cx", 3) == 0)
+        {
+            config.input_desc.back().cutoff_max = replace_zero(atoi(argv[pos++] + 3), "-cx", 1);
+        }
+        else
+        {
+            // Newline added so the message is terminated like the other errors.
+            cout << "Error: Unknow parameter: " << argv[pos] << "\n";
+            exit(1);
+        }
+    }
+}
+
+// Reads the output database name into config.output_desc and consumes its
+// -ci<value> / -cx<value> / -cs<value> options. A value of 0 is replaced by
+// 1 (see replace_zero).
+void CParametersParser::read_output_desc()
+{
+    if (pos >= argc)
+    {
+        cout << "Error: Output database source missed\n";
+        exit(1);
+    }
+    if (strncmp(argv[pos], "-", 1) == 0)
+    {
+        // A space was missing before "found", gluing it to the argument.
+        cout << "Error: Output database source required, but " << argv[pos] << " found\n";
+        exit(1);
+    }
+    config.output_desc.file_src = argv[pos++];
+    // Three different options are recognised here, so up to three must be
+    // consumed; a limit of two silently dropped the third one.
+    for (int i = 0; i < 3 && pos < argc; ++i)
+    {
+        if (strncmp(argv[pos], "-", 1) != 0)
+            break;
+        if (strncmp(argv[pos], "-ci", 3) == 0)
+        {
+            config.output_desc.cutoff_min = replace_zero(atoi(argv[pos++] + 3), "-ci", 1);
+        }
+        else if (strncmp(argv[pos], "-cx", 3) == 0)
+        {
+            config.output_desc.cutoff_max = replace_zero(atoi(argv[pos++] + 3), "-cx", 1);
+        }
+        else if (strncmp(argv[pos], "-cs", 3) == 0)
+        {
+            config.output_desc.counter_max = replace_zero(atoi(argv[pos++] + 3), "-cs", 1);
+        }
+        else
+        {
+            // Newline added so the message is terminated like the other errors.
+            cout << "Error: Unknow parameter: " << argv[pos] << "\n";
+            exit(1);
+        }
+    }
+}
+
+// Displays the usage text selected by the mode stored in the CConfig instance.
+void CParametersParser::Usage()
+{
+    CUsageDisplayerFactory displayer_factory(CConfig::GetInstance().mode);
+    displayer_factory.GetUsageDisplayer().Display();
+}
+
+// Stores the command line and binds the CConfig instance. Without any
+// argument after the program name the usage text is shown and the program
+// terminates.
+CParametersParser::CParametersParser(int argc, char** argv)
+    : argc(argc), argv(argv), pos(0), config(CConfig::GetInstance())
+{
+    if (argc >= 2)
+        return;
+
+    Usage();
+    exit(1);
+}
+
+// Parses the whole command line: global options, the operation name and the
+// operation-specific arguments. Results are stored in config; on any error
+// a message or the usage text is printed and the program terminates.
+void CParametersParser::Parse()
+{
+    // Operation keywords accepted on the command line and the mode each selects.
+    static const struct { const char* name; CConfig::Mode mode; } known_modes[] =
+    {
+        { "intersect",         CConfig::Mode::INTERSECTION },
+        { "kmers_subtract",    CConfig::Mode::KMERS_SUBTRACT },
+        { "counters_subtract", CConfig::Mode::COUNTERS_SUBTRACT },
+        { "union",             CConfig::Mode::UNION },
+        { "complex",           CConfig::Mode::COMPLEX },
+        { "sort",              CConfig::Mode::SORT },
+        { "reduce",            CConfig::Mode::REDUCE },
+        { "compact",           CConfig::Mode::COMPACT },
+        { "histogram",         CConfig::Mode::HISTOGRAM },
+        { "dump",              CConfig::Mode::DUMP },
+        { "compare",           CConfig::Mode::COMPARE },
+        { "filter",            CConfig::Mode::FILTER },
+    };
+
+    pos = 1;
+    parse_global_params();
+
+    // Only global options were given: argv[pos] would be the terminating
+    // null pointer, which must not be handed to strcmp.
+    if (pos >= argc)
+    {
+        Usage();
+        exit(1);
+    }
+
+    bool mode_found = false;
+    for (const auto& entry : known_modes)
+    {
+        if (strcmp(argv[pos], entry.name) == 0)
+        {
+            config.mode = entry.mode;
+            mode_found = true;
+            break;
+        }
+    }
+    if (!mode_found)
+    {
+        cout << "Error: Unknow mode: " << argv[pos] << "\n";
+        Usage();
+        exit(1);
+    }
+
+    if (argc == 2)
+    {
+        Usage();
+        exit(1);
+    }
+
+    pos++;
+    if (config.mode == CConfig::Mode::INTERSECTION || config.mode == CConfig::Mode::KMERS_SUBTRACT || config.mode == CConfig::Mode::UNION || config.mode == CConfig::Mode::COUNTERS_SUBTRACT)
+    {
+        read_input_desc(); //first input
+        read_input_desc(); //second input
+        read_output_desc(); //output
+    }
+    else if (config.mode == CConfig::Mode::FILTER)
+    {
+        read_input_desc(); //kmc db
+        read_input_fastq_desc(); //fastq input
+        read_output_fastq_desc();
+    }
+    else if (config.mode == CConfig::Mode::COMPLEX)
+    {
+        // The file name is argv[pos], not argv[2]: global options shift the
+        // position, and the argument may be missing entirely.
+        if (pos >= argc)
+        {
+            cout << "Error: operations description file missed\n";
+            exit(1);
+        }
+        if (strncmp(argv[pos], "-", 1) == 0)
+        {
+            cout << "Error: operations description file expected but " << argv[pos] << " found\n";
+            exit(1);
+        }
+        complex_parser = make_unique<CParser>(argv[pos]);
+        complex_parser->ParseInputs();
+    }
+    else if (config.mode == CConfig::Mode::DUMP)
+    {
+        read_dump_params();
+        read_input_desc();
+        read_output_desc();
+    }
+    else if (config.mode == CConfig::Mode::SORT || config.mode == CConfig::Mode::HISTOGRAM || config.mode == CConfig::Mode::REDUCE || config.mode == CConfig::Mode::COMPACT)
+    {
+        read_input_desc();
+        read_output_desc();
+        if (config.mode == CConfig::Mode::COMPACT)
+        {
+            if (config.output_desc.counter_max)
+                cout << "Warning: -cs can not be specified for compact operation, value specified will be ignored\n";
+            config.output_desc.counter_max = 1;
+        }
+    }
+    else if (config.mode == CConfig::Mode::COMPARE)
+    {
+        read_input_desc();
+        read_input_desc();
+    }
+}
+
+// Reads the header of every input database into config.headers and checks
+// that all of them use the same counter mode and k-mer length. Then fills
+// every cutoff / counter limit that is still 0 with a value derived from the
+// headers. Returns false, after printing a message, when the databases are
+// not compatible.
+bool CParametersParser::validate_input_dbs()
+{
+    config.headers.push_back(CKMC_header(config.input_desc.front().file_src));
+
+    const uint32 kmer_len = config.headers.front().kmer_len;
+    const uint32 mode = config.headers.front().mode;
+    if (mode == 1)
+    {
+        cout << "Error: quality counters are not supported in kmc tools\n";
+        return false;
+    }
+
+    // The remaining databases must agree with the first one.
+    for (uint32 i = 1; i < config.input_desc.size(); ++i)
+    {
+        config.headers.push_back(CKMC_header(config.input_desc[i].file_src));
+        CKMC_header& header = config.headers.back();
+        if (header.mode != mode)
+        {
+            cout << "Error: quality/direct based counters conflict!\n";
+            return false;
+        }
+        if (header.kmer_len != kmer_len)
+        {
+            cout << "Database " << config.input_desc.front().file_src << " contains " << kmer_len << "-mers, but database " << config.input_desc[i].file_src << " contains " << header.kmer_len << "-mers\n";
+            return false;
+        }
+    }
+    config.kmer_len = kmer_len;
+
+    //update cutoff_min and cutoff_max if it was not set with parameters
+    for (uint32 i = 0; i < config.input_desc.size(); ++i)
+    {
+        auto& desc = config.input_desc[i];
+        if (desc.cutoff_min == 0)
+            desc.cutoff_min = config.headers[i].min_count;
+        if (desc.cutoff_max == 0)
+            desc.cutoff_max = config.headers[i].max_count;
+    }
+
+    //update output description if it was not set with parameters
+    if (config.output_desc.cutoff_min == 0)
+    {
+        // Smallest cutoff_min over all inputs.
+        uint32 lowest_min = config.input_desc.front().cutoff_min;
+        for (const auto& desc : config.input_desc)
+        {
+            if (desc.cutoff_min < lowest_min)
+                lowest_min = desc.cutoff_min;
+        }
+        config.output_desc.cutoff_min = lowest_min;
+        if (config.verbose)
+            cout << "-ci was not specified for output. It will be set to " << lowest_min << "\n";
+    }
+
+    if (config.output_desc.cutoff_max == 0)
+    {
+        if (config.mode != CConfig::Mode::HISTOGRAM)
+        {
+            // Largest cutoff_max over all inputs.
+            uint32 highest_max = config.input_desc.front().cutoff_max;
+            for (const auto& desc : config.input_desc)
+            {
+                if (desc.cutoff_max > highest_max)
+                    highest_max = desc.cutoff_max;
+            }
+            config.output_desc.cutoff_max = highest_max;
+        }
+        else //for histogram default value differs
+        {
+            config.output_desc.cutoff_max = MIN(config.headers.front().max_count, MIN(HISTOGRAM_MAX_COUNTER_DEFAULT, (uint32)((1ull << (8 * config.headers.front().counter_size)) - 1)));
+        }
+
+        if (config.verbose)
+            cout << "-cx was not specified for output. It will be set to " << config.output_desc.cutoff_max << "\n";
+    }
+
+    if (config.output_desc.counter_max == 0)
+    {
+        // Widest counter size (in bytes) over all headers ...
+        uint32 widest_counter = config.headers.front().counter_size;
+        for (const auto& header : config.headers)
+        {
+            if (header.counter_size > widest_counter)
+                widest_counter = header.counter_size;
+        }
+
+        // ... converted to the largest value representable in that many bytes.
+        const uint32 counter_limit = (uint32)((1ull << (widest_counter << 3)) - 1);
+        config.output_desc.counter_max = counter_limit;
+        if (config.verbose)
+            cout << "-cs was not specified for output. It will be set to " << counter_limit << "\n";
+    }
+    return true;
+}
+
+// Distributes the available threads among the KMC2 input databases: as many
+// as possible for KMC2 inputs, with one thread kept back for the main thread
+// unless the operation takes a single argument.
+void CParametersParser::SetThreads()
+{
+    uint32 threads_left = config.avaiable_threads;
+
+    // The count can be 0 (hardware_concurrency() may return 0, "-t0" is
+    // accepted). Subtracting 1 from an unsigned 0 would wrap around to a
+    // huge value, so clamp first and only reserve the main thread when more
+    // than one thread is available.
+    if (threads_left == 0)
+        threads_left = 1;
+    if (!config.Is1ArgOper() && threads_left > 1)
+        --threads_left;
+
+    vector<reference_wrapper<CInputDesc>> kmc2_desc;
+    for (uint32 i = 0; i < config.headers.size(); ++i)
+    {
+        if (config.headers[i].IsKMC2())
+        {
+            kmc2_desc.push_back(ref(config.input_desc[i]));
+        }
+    }
+    if (kmc2_desc.size())
+    {
+        // Every input but the last gets the rounded-down share, the last
+        // one the rounded-up share; each gets at least one thread.
+        uint32 per_signle_kmc2_input = MAX(1, (uint32)(threads_left / kmc2_desc.size()));
+        uint32 per_last_kmc2_input = MAX(1, (uint32)((threads_left + kmc2_desc.size() - 1) / kmc2_desc.size()));
+
+        for (uint32 i = 0; i < kmc2_desc.size() - 1; ++i)
+            kmc2_desc[i].get().threads = per_signle_kmc2_input;
+
+        kmc2_desc.back().get().threads = per_last_kmc2_input;
+    }
+}
+
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/parameters_parser.h b/kmc_tools/parameters_parser.h
new file mode 100644
index 0000000..a2b2d4e
--- /dev/null
+++ b/kmc_tools/parameters_parser.h
@@ -0,0 +1,111 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _PARAMETERS_PARSER_H
+#define _PARAMETERS_PARSER_H
+
+#include "defs.h"
+#include "parser.h"
+#include <memory>
+class CParametersParser // Parses the kmc_tools command line (argc/argv) and stores the result in the CConfig instance.
+{
+ std::unique_ptr<CParser> complex_parser; // created by Parse() in COMPLEX mode, used by GetExpressionRoot()
+ int argc;
+ char** argv;
+ int pos; // index of the next argv entry to consume
+ CConfig& config; // bound to CConfig::GetInstance() by the constructor
+
+ uint32 replace_zero(uint32 val, const char* param_name, uint32 value_to_set_if_zero); // returns val, or the replacement (with a warning) when val is 0
+ void parse_int_or_float(bool& force_float, bool& force_int, float& float_value, uint32& int_val, const char* param_name); // reads a -ci/-cx value as real number or integer
+ void parse_global_params(); // leading -t / -v / -hp options
+ void read_input_fastq_desc(); // read file(s) and options of the filter operation
+ void read_output_fastq_desc(); // output file and options of the filter operation
+ void read_input_desc(); // one input database with its -ci / -cx options
+ void read_dump_params(); // options of the dump operation
+ void read_output_desc(); // output database with its -ci / -cx / -cs options
+public:
+
+ CParametersParser(int argc, char** argv); // shows usage and exits when argc < 2
+ void Usage(); // displays the usage text for the mode stored in CConfig
+
+ template<unsigned SIZE>
+ CExpressionNode<SIZE>* GetExpressionRoot(); // builds the expression tree for the parsed mode
+
+ void Parse(); // parses the whole command line into config
+ bool validate_input_dbs(); // reads the input headers, checks compatibility and fills unset limits
+ void SetThreads(); // assigns thread counts to the KMC2 inputs
+
+};
+
+// Builds the expression tree for the parsed operation: a binary node over
+// inputs 0 and 1 for the two-input set operations, the parsed output
+// expression for complex mode, and a single input node for sort / reduce /
+// compact. Any other mode terminates the program.
+template<unsigned SIZE>
+CExpressionNode<SIZE>* CParametersParser::GetExpressionRoot()
+{
+    switch (config.mode)
+    {
+    case CConfig::Mode::INTERSECTION:
+    case CConfig::Mode::KMERS_SUBTRACT:
+    case CConfig::Mode::UNION:
+    case CConfig::Mode::COUNTERS_SUBTRACT:
+    {
+        CExpressionNode<SIZE>* lhs = new CInputNode<SIZE>(0);
+        CExpressionNode<SIZE>* rhs = new CInputNode<SIZE>(1);
+        CExpressionNode<SIZE>* root = nullptr;
+        if (config.mode == CConfig::Mode::INTERSECTION)
+            root = new CIntersectionNode<SIZE>;
+        else if (config.mode == CConfig::Mode::KMERS_SUBTRACT)
+            root = new CKmersSubtractionNode<SIZE>;
+        else if (config.mode == CConfig::Mode::UNION)
+            root = new CUnionNode<SIZE>;
+        else if (config.mode == CConfig::Mode::COUNTERS_SUBTRACT)
+            root = new CCountersSubtractionNode<SIZE>;
+        else
+        {
+            std::cout << "Error: unknow operation\n";
+            exit(1);
+        }
+        root->AddLeftChild(lhs);
+        root->AddRightChild(rhs);
+        return root;
+    }
+    case CConfig::Mode::COMPLEX:
+        return complex_parser->ParseOutput<SIZE>();
+    case CConfig::Mode::SORT:
+        // Only KMC2 databases are accepted for sorting.
+        if (!config.headers.front().IsKMC2())
+        {
+            std::cout << "This database contains sorted k-mers already!";
+            exit(1);
+        }
+        return new CInputNode<SIZE>(0);
+    case CConfig::Mode::REDUCE:
+    case CConfig::Mode::COMPACT:
+        return new CInputNode<SIZE>(0);
+    default: //should never be here
+        std::cout << "Error: unknow operation\n";
+#ifdef ENABLE_DEBUG
+        std::cout << __FUNCTION__ << " line: " << __LINE__ << "\n";
+#endif
+        exit(1);
+    }
+}
+#endif
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/parser.cpp b/kmc_tools/parser.cpp
new file mode 100644
index 0000000..dfbb03f
--- /dev/null
+++ b/kmc_tools/parser.cpp
@@ -0,0 +1,184 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include "parser.h"
+#include "tokenizer.h"
+#include "output_parser.h"
+#include "config.h"
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+
+// Opens the operations description file src (terminating the program when
+// it cannot be opened) and prepares the patterns used to recognise
+// "name = rest" lines and empty lines.
+CParser::CParser(const std::string& src) :
+    line_no(0), config(CConfig::GetInstance())
+{
+    file.open(src);
+    if (!file.is_open())
+    {
+        std::cout << "Cannot open file: " << src << "\n";
+        exit(1);
+    }
+    //input_line_pattern = "\\s*(\\w*)\\s*=\\s*(.*)$";
+    input_line_pattern = "^\\s*([\\w-+]*)\\s*=\\s*(.*)$"; //TODO: consider valid file name
+    empty_line_pattern = "^\\s*$";
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+// Reads the input section of the description file: skips lines up to the
+// one containing "INPUT:", then parses every following non-empty line as an
+// input definition until a line containing "OUTPUT:" is reached. A missing
+// marker or an empty input section terminates the program.
+void CParser::ParseInputs()
+{
+    std::string line;
+
+    // Find the "INPUT:" marker.
+    do
+    {
+        if (!nextLine(line))
+        {
+            std::cout << "Error: 'INPUT:' missing\n";
+            exit(1);
+        }
+    } while (line.find("INPUT:") == std::string::npos);
+
+    // At least one definition must precede "OUTPUT:".
+    const bool has_line = nextLine(line);
+    if (!has_line || line.find("OUTPUT:") != std::string::npos)
+    {
+        std::cout << "Error: None input was defined\n";
+        exit(1);
+    }
+
+    do
+    {
+        parseInputLine(line);
+        if (!nextLine(line))
+        {
+            std::cout << "Error: 'OUTPUT:' missing\n";
+            exit(1);
+        }
+    } while (line.find("OUTPUT:") == std::string::npos);
+}
+
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+// Parses one "name = file [-ci<value>] [-cx<value>]" line. The description
+// is appended to config.input_desc and the name is mapped to its index
+// there. A malformed line, a redefined name, a missing file name or an
+// unknown option terminates the program.
+void CParser::parseInputLine(const std::string& line)
+{
+    std::smatch match;
+    if (!std::regex_search(line, match, input_line_pattern))
+    {
+        std::cout << "Error: wrong line format, line: " << line_no << "\n";
+        exit(1);
+    }
+#ifdef ENABLE_DEBUG
+    std::cout << "\ninput name: " << match[1];
+    std::cout << "\nafter = " << match[2];
+#endif
+    const std::string name = match[1];
+    if (input.find(name) != input.end())
+    {
+        std::cout << "Error: Name redefinition(" << name << ")" << " line: " << line_no << "\n";
+        exit(1);
+    }
+
+    std::istringstream rest(match[2].str());
+    CInputDesc desc;
+    if (!(rest >> desc.file_src))
+    {
+        std::cout << "Error: file name for " << name << " was not specified, line: "<< line_no <<"\n";
+        exit(1);
+    }
+
+    for (std::string option; rest >> option; )
+    {
+        if (strncmp(option.c_str(), "-ci", 3) == 0)
+            desc.cutoff_min = atoi(option.c_str() + 3);
+        else if (strncmp(option.c_str(), "-cx", 3) == 0)
+            desc.cutoff_max = atoi(option.c_str() + 3);
+        else
+        {
+            std::cout << "Error: Unknow parameter " << option << " for variable " << name << ", line: "<< line_no <<"\n";
+            exit(1);
+        }
+    }
+
+    config.input_desc.push_back(desc);
+    input[name] = (uint32)(config.input_desc.size() - 1);
+}
+
+/*****************************************************************************************************************************/
+// Reads the line that follows "OUTPUT_PARAMS:" and stores its -ci / -cx /
+// -cs values in config.output_desc. A missing line only produces a warning;
+// an unknown option terminates the program.
+void CParser::parseOtuputParamsLine()
+{
+    std::string line;
+
+    if (!nextLine(line))
+    {
+        std::cout << "Warning: OUTPUT_PARAMS exists, but no parameters are defined\n";
+    }
+    else
+    {
+        std::istringstream stream(line);
+        std::string tmp;
+        while (stream >> tmp)
+        {
+            if (strncmp(tmp.c_str(), "-ci", 3) == 0)
+            {
+                config.output_desc.cutoff_min = atoi(tmp.c_str() + 3);
+                continue;
+            }
+            else if (strncmp(tmp.c_str(), "-cx", 3) == 0)
+            {
+                config.output_desc.cutoff_max = atoi(tmp.c_str() + 3);
+                continue;
+            }
+            else if ((strncmp(tmp.c_str(), "-cs", 3) == 0))
+            {
+                config.output_desc.counter_max = atoi(tmp.c_str() + 3);
+                continue;
+            }
+            // There is no variable in this section; the message used to
+            // print the parameter a second time as the "variable" name.
+            std::cout << "Error: Unknow parameter " << tmp << " in OUTPUT_PARAMS, line: " << line_no << "\n";
+            exit(1);
+        }
+    }
+}
+
+/*****************************************************************************************************************************/
+// Reads lines from the description file until a non-empty one is found.
+// Stores it in `line` and returns true; returns false when no further line
+// can be read. line_no counts every line that was actually read.
+bool CParser::nextLine(std::string& line)
+{
+    // The result of getline is tested rather than eof(): a stream that fails
+    // without reaching end-of-file never sets eof, so the former loop would
+    // keep reading empty lines forever.
+    while (std::getline(file, line))
+    {
+        ++line_no;
+        std::smatch match;
+        if (!std::regex_search(line, match, empty_line_pattern))
+            return true;
+    }
+    return false;
+}
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/parser.h b/kmc_tools/parser.h
new file mode 100644
index 0000000..f4e1b94
--- /dev/null
+++ b/kmc_tools/parser.h
@@ -0,0 +1,107 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _PARSER_H
+#define _PARSER_H
+#include "defs.h"
+#include "expression_node.h"
+#include "tokenizer.h"
+#include "output_parser.h"
+
+#include <iostream>
+#include <fstream>
+#include <regex>
+#include <map>
+#include <list>
+
+
+//************************************************************************************************************
+// CParser - parser for complex operations
+//************************************************************************************************************
+class CParser // Reads the operations description file used by the complex mode.
+{
+ std::ifstream file; // the description file, opened by the constructor
+ uint32 line_no; // lines read so far; reported in error messages
+ std::regex input_line_pattern; // matches "name = rest of line"
+ std::regex empty_line_pattern; // matches lines holding only whitespace
+ std::map<std::string, uint32> input; // variable name -> index in config.input_desc
+ void parseInputLine(const std::string& line); // one input definition line
+
+ template<unsigned SIZE>
+ CExpressionNode<SIZE>* parseOutputLine(const std::string& line); // "output = expression" line
+ void parseOtuputParamsLine(); // line following "OUTPUT_PARAMS:"
+ bool nextLine(std::string& line); // next non-empty line; false when none is left
+ CConfig& config; // bound to CConfig::GetInstance() by the constructor
+
+public:
+ CParser(const std::string& src); // opens src; exits when it cannot be opened
+ void ParseInputs(); // reads the section between "INPUT:" and "OUTPUT:"
+
+ template<unsigned SIZE>
+ CExpressionNode<SIZE>* ParseOutput(); // reads the output expression and the optional OUTPUT_PARAMS
+
+};
+
+//************************************************************************************************************
+// Parses the output definition line into an expression tree. Afterwards the
+// rest of the file is scanned for a line containing "OUTPUT_PARAMS:"; when
+// found, the line following it is parsed as output parameters.
+template<unsigned SIZE> CExpressionNode<SIZE>* CParser::ParseOutput()
+{
+    std::string line;
+    const bool has_line = nextLine(line);
+    if (!has_line || line.find("OUTPUT_PARAMS:") != std::string::npos)
+    {
+        std::cout << "Error: None output was defined\n";
+        exit(1);
+    }
+
+    CExpressionNode<SIZE>* root = parseOutputLine<SIZE>(line);
+
+    // Reading stops at the first "OUTPUT_PARAMS:" marker.
+    bool params_found = false;
+    while (!params_found && nextLine(line))
+        params_found = line.find("OUTPUT_PARAMS:") != std::string::npos;
+    if (params_found)
+        parseOtuputParamsLine();
+
+    return root;
+}
+
+//************************************************************************************************************
+// Parses an "output = expression" line: the left-hand side becomes the
+// output file source, the right-hand side is tokenized and handed to
+// COutputParser. A line of any other shape terminates the program.
+template<unsigned SIZE> CExpressionNode<SIZE>* CParser::parseOutputLine(const std::string& line)
+{
+    std::smatch match;
+    if (!std::regex_search(line, match, input_line_pattern))
+    {
+        std::cout << "Error: wrong line format, line: " << line_no << "\n";
+        exit(1);
+    }
+#ifdef ENABLE_DEBUG
+    std::cout << "out file name " << match[1] << "\n";
+    std::cout << "rest of output " << match[2] << "\n";
+
+    std::cout << "Tokenize resf of output\n";
+#endif
+    config.output_desc.file_src = match[1];
+
+    CTokenizer tokenizer;
+    std::list<Token> expression_tokens;
+    tokenizer.Tokenize(match[2], expression_tokens);
+
+    COutputParser<SIZE> expression_parser(expression_tokens, input);
+    CExpressionNode<SIZE>* root = expression_parser.Parse();
+    return root;
+}
+#endif
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/percent_progress.cpp b/kmc_tools/percent_progress.cpp
new file mode 100644
index 0000000..922afd5
--- /dev/null
+++ b/kmc_tools/percent_progress.cpp
@@ -0,0 +1,93 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include "percent_progress.h"
+#include <iostream>
+#include <string>
+using namespace std;
+
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+uint32 CPercentProgress::RegisterItem(const std::string& name, uint64 max_value)
+{
+	// The returned id is the index the new entry occupies in `items`.
+	const uint32 new_id = static_cast<uint32>(items.size());
+	items.emplace_back(name, max_value);
+	return new_id;
+}
+
+/*****************************************************************************************************************************/
+uint32 CPercentProgress::RegisterItem(uint64 max_value)
+{
+	// The name is generated: "in" followed by the 1-based position of the item.
+	const auto position = items.size() + 1;
+	items.emplace_back("in" + std::to_string(position), max_value);
+	display();
+	return static_cast<uint32>(position - 1);
+}
+
+/*****************************************************************************************************************************/
+void CPercentProgress::UpdateItem(uint32 id)
+{
+	// Single-step update: only every to_next_update_pattern-th call is
+	// forwarded to the two-argument overload, as one batch of that size.
+	CDisplayItem& item = items[id];
+	if (--item.to_next_update != 0)
+		return;
+
+	item.to_next_update = item.to_next_update_pattern;
+	UpdateItem(id, item.to_next_update_pattern);
+}
+
+/*****************************************************************************************************************************/
+void CPercentProgress::UpdateItem(uint32 id, uint32 offset)
+{
+	// Adds `offset` to the item's value and redraws only when the integer
+	// percentage actually changed.
+	CDisplayItem& item = items[id];
+	const uint32 old_percent = item.cur_percent;
+
+	item.cur_val += offset;
+	// An item whose maximum is 0 is reported as 100%.
+	item.cur_percent = item.max_val
+		? static_cast<uint32>((item.cur_val * 100) / item.max_val)
+		: 100;
+
+	if (item.cur_percent != old_percent)
+		display();
+}
+
+/*****************************************************************************************************************************/
+void CPercentProgress::Complete(uint32 id)
+{
+	// Forces the item to 100%; nothing is redrawn if it is already there.
+	CDisplayItem& item = items[id];
+	if (item.cur_percent == 100)
+		return;
+
+	item.cur_percent = 100;
+	display();
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+/*****************************************************************************************************************************/
+void CPercentProgress::display()
+{
+	// Rewrites the current console line ("\r") with "<name>: <percent>% " for
+	// every registered item, unless the progress has been hidden.
+	if (!hide_progress)
+	{
+		std::cout << "\r";
+		for (auto it = items.begin(); it != items.end(); ++it)
+			std::cout << it->name << ": " << it->cur_percent << "% ";
+		std::cout.flush();
+	}
+}
+
+/*****************************************************************************************************************************/
+CPercentProgress::CDisplayItem::CDisplayItem(const std::string name, uint64 max_val) : name(name), max_val(max_val)
+{
+	// Single-step updates are batched in groups of max_val / 100 (about one
+	// percent of the range), but never fewer than one step per batch.
+	to_next_update = to_next_update_pattern = (uint32)MAX(1, max_val / 100);
+}
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/percent_progress.h b/kmc_tools/percent_progress.h
new file mode 100644
index 0000000..a8683e9
--- /dev/null
+++ b/kmc_tools/percent_progress.h
@@ -0,0 +1,48 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _PERCENT_PROGRESS_H
+#define _PERCENT_PROGRESS_H
+
+#include "defs.h"
+#include <vector>
+#include <string>
+//************************************************************************************************************
+// CPercentProgress - class to display progress of reading inputs
+//************************************************************************************************************
+class CPercentProgress
+{
+ // When true, display() returns without printing anything.
+ bool hide_progress = false;
+ // State of one tracked item (one "name: NN%" entry of the progress line).
+ struct CDisplayItem
+ {
+ std::string name; // label printed in front of the percentage
+ uint64 cur_val = 0; // value accumulated so far
+ uint64 max_val; // value that corresponds to 100%
+ uint32 cur_percent = 0; // last computed percentage
+
+ uint32 to_next_update; // single-step updates left until the next batched update
+ uint32 to_next_update_pattern; // batch size the countdown above is reset to
+ public:
+ CDisplayItem(const std::string name, uint64 max_val);
+ };
+ // Registered items; the ids returned by RegisterItem() are indices into this vector.
+ std::vector<CDisplayItem> items;
+ // Prints the state of all items on one console line.
+ void display();
+public:
+ // Registers an item under the given name and returns its id.
+ uint32 RegisterItem(const std::string& name, uint64 max_value);
+ // Registers an item under a generated name ("in1", "in2", ...), redraws and returns its id.
+ uint32 RegisterItem(uint64 max_value);
+ // Advances the item by a single step.
+ void UpdateItem(uint32 id);
+ // Sets the item to 100%.
+ void Complete(uint32 id);
+ // Advances the item by `offset` steps.
+ void UpdateItem(uint32 id, uint32 offset);
+ // Turns off all further output.
+ void Hide(){ hide_progress = true; }
+};
+
+#endif
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/queues.h b/kmc_tools/queues.h
new file mode 100644
index 0000000..2d38c18
--- /dev/null
+++ b/kmc_tools/queues.h
@@ -0,0 +1,393 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _QUEUES_H_
+#define _QUEUES_H_
+
+#include "defs.h"
+#include "bundle.h"
+#include <mutex>
+#include <vector>
+#include <condition_variable>
+#include <list>
+#include <queue>
+
+
+
+// Bounded, mutex-protected FIFO of (buffer, size) pairs.
+// push() blocks while max_inside entries are queued; pop() blocks while the
+// queue is empty and the writer has not called mark_completed().
+class CSufWriteQueue
+{
+ uint32 buf_size; // size of the replacement buffer allocated by push()
+ uint32 max_inside; // maximal number of queued entries
+
+ using elem_t = std::pair<uchar*, uint32>;
+ std::list<elem_t> content;
+
+ mutable std::mutex mtx;
+ uint32 n_writers; // set to 1 by init(), decremented by mark_completed()
+ std::condition_variable cv_pop, cv_push;
+public:
+ // Must be called before the queue is used: the members set here have no other initialization.
+ void init(uint32 _buf_size, uint32 _max_inside)
+ {
+ buf_size = _buf_size;
+ max_inside = _max_inside;
+ n_writers = 1;
+ }
+
+ // Queues `buf` (with its `size`) and replaces the caller's pointer with a
+ // freshly allocated array of buf_size bytes. Blocks while the queue is full.
+ void push(uchar* &buf, uint32 size)
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ cv_push.wait(lck, [this]{return content.size() < max_inside; });
+
+ bool was_empty = content.empty();
+
+ content.push_back(std::make_pair(buf, size));
+
+ buf = new uchar[buf_size];
+
+ // Consumers can only be waiting when the queue was empty.
+ if (was_empty)
+ cv_pop.notify_all();
+ }
+
+ // Fetches the oldest entry. Returns false once the queue is empty and
+ // n_writers has dropped to zero.
+ // NOTE(review): nothing in this class deletes the buffers - presumably the caller of pop() owns them; confirm.
+ bool pop(uchar* &buf, uint32& size)
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ cv_pop.wait(lck, [this]{return !content.empty() || !n_writers; });
+ if (!n_writers && content.empty())
+ return false;
+
+ bool was_full = max_inside == content.size();
+
+ buf = content.front().first;
+ size = content.front().second;
+ content.pop_front();
+
+ // Producers can only be waiting when the queue was full.
+ if (was_full)
+ cv_push.notify_all();
+
+ return true;
+ }
+
+
+ // Signals that the writer is done; wakes consumers when no writer is left.
+ void mark_completed()
+ {
+ std::lock_guard<std::mutex> lck(mtx);
+ --n_writers;
+ if (!n_writers)
+ cv_pop.notify_all();
+ }
+};
+
+
+// Fixed-capacity ring buffer of CBundleData<SIZE> guarded by a mutex.
+// Elements are exchanged with the caller by std::swap.
+template<unsigned SIZE> class CCircularQueue
+{
+ std::vector<CBundleData<SIZE>> buff;
+ // start == end means "empty" unless `full` is set, in which case every slot is occupied.
+ bool full, is_completed;
+ int start, end; // index of the oldest element / of the next free slot
+ mutable std::mutex mtx;
+
+ std::condition_variable cv_push;
+ std::condition_variable cv_pop;
+
+ // Once set by force_finish(), push() and pop() return false.
+ bool forced_to_finish = false;
+
+public:
+ // `size` is the number of slots of the ring.
+ CCircularQueue(int size) : buff(size), full(false), is_completed(false), start(0), end(0)
+ {
+
+ }
+
+ // Swaps `bundle_data` into the next free slot and calls Clear() on the
+ // caller's object afterwards. Blocks while the ring is full.
+ // Returns false (without storing anything) after force_finish().
+ bool push(CBundleData<SIZE>& bundle_data)
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ cv_push.wait(lck, [this]{return !full || forced_to_finish; });
+
+ if (forced_to_finish)
+ {
+ return false;
+ }
+
+ bool was_empty = start == end;
+
+ std::swap(buff[end], bundle_data);
+ bundle_data.Clear();
+ end = (end + 1) % buff.size();
+
+ if (end == start)
+ full = true;
+
+ if (was_empty)
+ cv_pop.notify_all();
+
+ return true;
+ }
+
+ // Swaps the oldest element into `bundle_data` and calls Clear() on the slot.
+ // Blocks while the ring is empty. Returns false after force_finish() (even
+ // if elements are still queued) or when the ring is empty and completed.
+ bool pop(CBundleData<SIZE>& bundle_data)
+ {
+ std::unique_lock<std::mutex> lck(mtx);
+ cv_pop.wait(lck, [this]{ return start != end || full || is_completed || forced_to_finish; });
+
+ if (forced_to_finish)
+ return false;
+
+ if (is_completed && !full && start == end)
+ return false;
+
+ bool was_full = full;
+ std::swap(buff[start], bundle_data);
+ buff[start].Clear();
+ start = (start + 1) % buff.size();
+ full = false;
+ if (was_full)
+ cv_push.notify_all();
+ return true;
+ }
+
+ // Marks the end of input: pop() drains what is left and then returns false.
+ void mark_completed()
+ {
+ std::lock_guard<std::mutex> lck(mtx);
+ is_completed = true;
+ cv_pop.notify_all();
+ }
+
+ // Wakes every waiting thread and makes all later push()/pop() calls return false.
+ void force_finish()
+ {
+ std::lock_guard<std::mutex> lck(mtx);
+ forced_to_finish = true;
+ cv_pop.notify_all();
+ cv_push.notify_all();
+ }
+
+};
+
+// Mutex-protected FIFO of input file names, filled once at construction.
+class CInputFilesQueue {
+	using elem_t = std::string;
+	using queue_t = std::queue<elem_t, std::list<elem_t>>;
+
+	queue_t q;
+
+	mutable std::mutex mtx;			// guards q
+
+public:
+	// Enqueues all the given names, keeping their order.
+	CInputFilesQueue(const std::vector<std::string> &file_names) {
+		std::unique_lock<std::mutex> lck(mtx);
+
+		for (const auto& name : file_names)
+			q.push(name);
+	}
+
+	// Never blocks: hands out the oldest name, or returns false (leaving
+	// file_name untouched) when the queue is empty.
+	bool pop(std::string &file_name) {
+		std::lock_guard<std::mutex> lck(mtx);
+
+		const bool has_item = !q.empty();
+		if (has_item)
+		{
+			file_name = q.front();
+			q.pop();
+		}
+
+		return has_item;
+	}
+};
+
+// Pool of equally sized memory parts carved out of one big allocation.
+// reserve() blocks until a part is free; free() gives a part back.
+class CMemoryPool {
+	int64 total_size;
+	int64 part_size;
+	int64 n_parts_total;
+	int64 n_parts_free;
+
+	uchar *buffer, *raw_buffer;		// aligned start of the parts / pointer returned by new[]
+	uint32 *stack;					// indices of free parts; the top is at n_parts_free - 1
+
+	mutable std::mutex mtx;			// guards stack and n_parts_free
+	std::condition_variable cv;		// signalled whenever a part is given back
+
+	// Waits for a free part, pops its index and stores its address in `part`.
+	template<typename T> void take_part(T* &part)
+	{
+		std::unique_lock<std::mutex> lck(mtx);
+		cv.wait(lck, [this]{return n_parts_free > 0; });
+
+		part = (T*)(buffer + stack[--n_parts_free] * part_size);
+	}
+
+	// Converts the address back to a part index, pushes it and wakes the waiters.
+	template<typename T> void give_back_part(T* part)
+	{
+		std::lock_guard<std::mutex> lck(mtx);
+
+		stack[n_parts_free++] = (uint32)((((uchar *)part) - buffer) / part_size);
+		cv.notify_all();
+	}
+
+public:
+	CMemoryPool(int64 _total_size, int64 _part_size) {
+		raw_buffer = nullptr;
+		buffer = nullptr;
+		stack = nullptr;
+		prepare(_total_size, _part_size);
+	}
+	~CMemoryPool() {
+		release();
+	}
+
+	// (Re)allocates the pool: _total_size / _part_size parts, each rounded up
+	// to a multiple of 16 bytes, starting at a 64-byte aligned address.
+	void prepare(int64 _total_size, int64 _part_size) {
+		release();
+
+		n_parts_total = _total_size / _part_size;
+		part_size = (_part_size + 15) / 16 * 16;		// to allow mapping pointer to int*
+		n_parts_free = n_parts_total;
+
+		total_size = n_parts_total * part_size;
+
+		// 64 spare bytes leave room for moving the start up to the alignment boundary.
+		raw_buffer = new uchar[total_size + 64];
+		buffer = raw_buffer;
+		while (((uint64)buffer) % 64)
+			buffer++;
+
+		// Initially every part is free.
+		stack = new uint32[n_parts_total];
+		for (uint32 i = 0; i < n_parts_total; ++i)
+			stack[i] = i;
+	}
+
+	// Frees the underlying memory; safe to call repeatedly.
+	void release(void) {
+		delete[] raw_buffer;
+		raw_buffer = nullptr;
+		buffer = nullptr;
+
+		delete[] stack;
+		stack = nullptr;
+	}
+
+	// Allocate memory buffer - uchar*
+	void reserve(uchar* &part) { take_part(part); }
+	// Allocate memory buffer - char*
+	void reserve(char* &part) { take_part(part); }
+	// Allocate memory buffer - uint32*
+	void reserve(uint32* &part) { take_part(part); }
+	// Allocate memory buffer - uint64*
+	void reserve(uint64* &part) { take_part(part); }
+	// Allocate memory buffer - double*
+	void reserve(double* &part) { take_part(part); }
+
+	// Deallocate memory buffer - uchar*
+	void free(uchar* part) { give_back_part(part); }
+	// Deallocate memory buffer - char*
+	void free(char* part) { give_back_part(part); }
+	// Deallocate memory buffer - uint32*
+	void free(uint32* part) { give_back_part(part); }
+	// Deallocate memory buffer - uint64*
+	void free(uint64* part) { give_back_part(part); }
+	// Deallocate memory buffer - double*
+	void free(double* part) { give_back_part(part); }
+};
+
+// Unbounded, mutex-protected FIFO of (part pointer, size) pairs.
+// pop() blocks until an element is available or n_readers has reached zero.
+class CPartQueue {
+	typedef std::pair<uchar *, uint64> elem_t;
+	typedef std::queue<elem_t, std::list<elem_t>> queue_t;
+
+	queue_t q;
+	bool is_completed;		// set in the constructor; not read anywhere in this class
+	int n_readers;			// number of mark_completed() calls still expected
+
+	mutable std::mutex mtx;			// The mutex to synchronise on
+	std::condition_variable cv_queue_empty;
+
+public:
+	// _n_readers: how many times mark_completed() will be called before the
+	// queue counts as finished.
+	CPartQueue(int _n_readers) {
+		std::unique_lock<std::mutex> lck(mtx);
+		is_completed = false;
+		n_readers = _n_readers;
+	};
+	~CPartQueue() {};
+
+	// True when no element is queued at the moment of the call.
+	bool empty() {
+		std::lock_guard<std::mutex> lck(mtx);
+		return q.empty();
+	}
+	// True when the queue is empty and every reader has called mark_completed().
+	bool completed() {
+		std::lock_guard<std::mutex> lck(mtx);
+		return q.empty() && !n_readers;
+	}
+	// Called once per reader; the last call wakes all threads blocked in pop().
+	void mark_completed() {
+		std::lock_guard<std::mutex> lck(mtx);
+		n_readers--;
+		if (!n_readers)
+			cv_queue_empty.notify_all();
+	}
+	// Appends an element; never blocks.
+	void push(uchar *part, uint64 size) {
+		std::unique_lock<std::mutex> lck(mtx);
+
+		bool was_empty = q.empty();
+		q.push(std::make_pair(part, size));
+
+		// Consumers can only be waiting when the queue was empty.
+		if (was_empty)
+			cv_queue_empty.notify_all();
+	}
+	// Fetches the oldest element. Returns false when the queue is empty and
+	// no reader is left.
+	bool pop(uchar *&part, uint64 &size) {
+		std::unique_lock<std::mutex> lck(mtx);
+		cv_queue_empty.wait(lck, [this]{return !this->q.empty() || !this->n_readers; });
+
+		if (q.empty())
+			return false;
+
+		// Plain member access instead of std::tie: this header does not
+		// include <tuple>, so std::tie was only available by accident.
+		const elem_t& head = q.front();
+		part = head.first;
+		size = head.second;
+		q.pop();
+
+		return true;
+	}
+};
+
+#endif
+
diff --git a/kmc_tools/stdafx.cpp b/kmc_tools/stdafx.cpp
new file mode 100644
index 0000000..62830a2
--- /dev/null
+++ b/kmc_tools/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// kmc_tools.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/kmc_tools/stdafx.h b/kmc_tools/stdafx.h
new file mode 100644
index 0000000..4777423
--- /dev/null
+++ b/kmc_tools/stdafx.h
@@ -0,0 +1,17 @@
+#ifdef WIN32
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+//
+
+#pragma once
+
+#include "targetver.h"
+
+#include <stdio.h>
+#include <tchar.h>
+
+
+
+// TODO: reference additional headers your program requires here
+#endif
\ No newline at end of file
diff --git a/kmc_tools/targetver.h b/kmc_tools/targetver.h
new file mode 100644
index 0000000..87c0086
--- /dev/null
+++ b/kmc_tools/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+
+// Including SDKDDKVer.h defines the highest available Windows platform.
+
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+
+#include <SDKDDKVer.h>
diff --git a/kmc_tools/timer.h b/kmc_tools/timer.h
new file mode 100644
index 0000000..c8e42fc
--- /dev/null
+++ b/kmc_tools/timer.h
@@ -0,0 +1,33 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _TIMER_H
+#define _TIMER_H
+
+#include <chrono>
+// Minimal stopwatch: start() stores the current instant, get_time() reports
+// the milliseconds elapsed since then.
+class CTimer
+{
+	// Must be the time_point of the clock whose now() is stored below.
+	// high_resolution_clock is not required to be an alias of system_clock,
+	// so using system_clock::time_point here does not compile on every
+	// standard library.
+	using time_p = std::chrono::high_resolution_clock::time_point;
+	time_p _start, _end;	// _end is not used by this class
+public:
+	// Remembers the current instant as the reference point for get_time().
+	void start()
+	{
+		_start = std::chrono::high_resolution_clock::now();
+	}
+	// Returns the whole milliseconds elapsed since the last start(), as a double.
+	double get_time() const
+	{
+		const auto elapsed = std::chrono::high_resolution_clock::now() - _start;
+		return static_cast<double>(std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count());
+	}
+};
+
+
+
+#endif
diff --git a/kmc_tools/tokenizer.cpp b/kmc_tools/tokenizer.cpp
new file mode 100644
index 0000000..ade79a1
--- /dev/null
+++ b/kmc_tools/tokenizer.cpp
@@ -0,0 +1,77 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#include "stdafx.h"
+#include "tokenizer.h"
+
+/*****************************************************************************************************************************/
+/******************************************************** CONSTRUCTOR ********************************************************/
+/*****************************************************************************************************************************/
+
+CTokenizer::CTokenizer()
+{
+	// Tokenize() tries the patterns in this order. Each one is anchored at the
+	// beginning of the remaining text and captures the token text in group 1.
+	token_patterns.reserve(7);
+	token_patterns.emplace_back(std::regex("^(\\()"), TokenType::PARENTHESIS_OPEN);
+	token_patterns.emplace_back(std::regex("^(\\))"), TokenType::PARENTHESIS_CLOSE);
+	token_patterns.emplace_back(std::regex("^(\\-)"), TokenType::STRICT_MINUS_OPER);
+	token_patterns.emplace_back(std::regex("^(\\~)"), TokenType::COUNTER_MINUS_OPER);
+	token_patterns.emplace_back(std::regex("^(\\+)"), TokenType::PLUS_OPER);
+	token_patterns.emplace_back(std::regex("^(\\*)"), TokenType::MUL_OPER);
+	// A variable needs at least one word character. With "\\w*" this pattern
+	// also matched the empty string, so it succeeded on every input: an
+	// unsupported character made Tokenize() append empty VARIABLE tokens
+	// forever instead of reporting the "wrong output format" error.
+	token_patterns.emplace_back(std::regex("^(\\w+)"), TokenType::VARIABLE);
+}
+
+
+/*****************************************************************************************************************************/
+/********************************************************** PUBLIC ***********************************************************/
+/*****************************************************************************************************************************/
+
+void CTokenizer::Tokenize(const std::string& _expression, std::list<Token>& tokens)
+{
+	// Works on a private copy: the recognised token (and the whitespace after
+	// it) is cut off the front until nothing is left. Tokens are appended to
+	// `tokens`; the list is not cleared first.
+	std::string rest(_expression);
+	std::smatch found;
+	leftTrimString(rest, 0);
+	while (!rest.empty())
+	{
+		// The first pattern that matches decides the token type.
+		bool recognised = false;
+		for (auto it = token_patterns.begin(); it != token_patterns.end() && !recognised; ++it)
+		{
+			if (!std::regex_search(rest, found, it->first))
+				continue;
+#ifdef ENABLE_DEBUG
+			std::cout << found[1];
+#endif
+			// Copy the text before `rest` is modified (that invalidates `found`).
+			const std::string text = found[1];
+			tokens.push_back(std::make_pair(text, it->second));
+			leftTrimString(rest, (int)text.length());
+			recognised = true;
+		}
+		if (!recognised)
+		{
+			std::cout << "Error: wrong output format near : " << rest << "\n";
+			exit(1);
+		}
+	}
+}
+
+/*****************************************************************************************************************************/
+/********************************************************** PRIVATE **********************************************************/
+/*****************************************************************************************************************************/
+
+void CTokenizer::leftTrimString(std::string& str, int start_pos)
+{
+	// Drops the first start_pos characters together with any whitespace that
+	// directly follows them. If only whitespace follows, the search yields
+	// npos and the whole string is erased.
+	const auto keep_from = str.find_first_not_of(" \t\r\n\v\f", start_pos);
+	str.erase(0, keep_from);
+}
+
+
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmc_tools/tokenizer.h b/kmc_tools/tokenizer.h
new file mode 100644
index 0000000..e596243
--- /dev/null
+++ b/kmc_tools/tokenizer.h
@@ -0,0 +1,39 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _TOKENIZER_H
+#define _TOKENIZER_H
+
+#include "defs.h"
+#include <vector>
+#include <regex>
+#include <list>
+#include <iostream>
+
+enum class TokenType{ VARIABLE, PLUS_OPER, STRICT_MINUS_OPER, COUNTER_MINUS_OPER, MUL_OPER, PARENTHESIS_OPEN, PARENTHESIS_CLOSE, TERMINATOR };
+using Token = std::pair<std::string, TokenType>;
+
+//************************************************************************************************************
+// CTokenizer - Tokenizer for k-mers set operations
+//************************************************************************************************************
+class CTokenizer
+{
+public:
+ // Prepares the list of token patterns.
+ CTokenizer();
+ // Splits _expression into tokens and appends them to `tokens`.
+ // Prints an error and terminates the program when no pattern matches.
+ void Tokenize(const std::string& _expression, std::list<Token>& tokens);
+
+private:
+ // Patterns with the token type each of them produces, tried in order.
+ std::vector<std::pair<std::regex, TokenType>> token_patterns;
+ // Removes the first start_pos characters of str and the whitespace following them.
+ void leftTrimString(std::string& str, int start_pos);
+};
+
+#endif
+
+// ***** EOF
\ No newline at end of file
diff --git a/kmer_counter.sln b/kmer_counter.sln
index dec003e..cf3fc43 100644
--- a/kmer_counter.sln
+++ b/kmer_counter.sln
@@ -9,6 +9,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kmc_dump", "kmc_dump\kmc_du
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kmc_dump_sample", "kmc_dump_sample\kmc_dump_sample.vcxproj", "{17823F37-86DE-4E58-B354-B84DA9EDA6A1}"
EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kmc_tools", "kmc_tools\kmc_tools.vcxproj", "{F3B0CC94-9DD0-4642-891C-EA08BDA50260}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Mixed Platforms = Debug|Mixed Platforms
@@ -52,6 +54,17 @@ Global
{17823F37-86DE-4E58-B354-B84DA9EDA6A1}.Release|Win32.ActiveCfg = Release|Win32
{17823F37-86DE-4E58-B354-B84DA9EDA6A1}.Release|Win32.Build.0 = Release|Win32
{17823F37-86DE-4E58-B354-B84DA9EDA6A1}.Release|x64.ActiveCfg = Release|x64
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Debug|Win32.ActiveCfg = Debug|Win32
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Debug|Win32.Build.0 = Debug|Win32
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Debug|x64.ActiveCfg = Debug|x64
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Release|Win32.ActiveCfg = Release|Win32
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Release|Win32.Build.0 = Release|Win32
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Release|x64.ActiveCfg = Release|x64
+ {F3B0CC94-9DD0-4642-891C-EA08BDA50260}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
diff --git a/kmer_counter/asmlib_wrapper.h b/kmer_counter/asmlib_wrapper.h
index 2fd2794..1ee006e 100644
--- a/kmer_counter/asmlib_wrapper.h
+++ b/kmer_counter/asmlib_wrapper.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _ASMLIB_WRAPPER_H
diff --git a/kmer_counter/bkb_merger.h b/kmer_counter/bkb_merger.h
index 3b95462..bb2294c 100644
--- a/kmer_counter/bkb_merger.h
+++ b/kmer_counter/bkb_merger.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _HBH_MERGER_H
@@ -51,8 +51,8 @@ CBigKmerBinMerger<KMER_T,SIZE>::CBigKmerBinMerger(CKMCParams& Params, CKMCQueues
kmer_len = Params.kmer_len;
lut_prefix_len = Params.lut_prefix_len;
cutoff_min = Params.cutoff_min;
- cutoff_max = Params.cutoff_max;
- counter_max = Params.counter_max;
+ cutoff_max = (int32)Params.cutoff_max;
+ counter_max = (int32)Params.counter_max;
sm_pmm_merger_suff = Queues.sm_pmm_merger_suff;
sm_pmm_merger_lut = Queues.sm_pmm_merger_lut;
sm_pmm_sub_bin_suff = Queues.sm_pmm_sub_bin_suff;
diff --git a/kmer_counter/bkb_reader.cpp b/kmer_counter/bkb_reader.cpp
index 67d77c4..eb1e307 100644
--- a/kmer_counter/bkb_reader.cpp
+++ b/kmer_counter/bkb_reader.cpp
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "stdafx.h"
diff --git a/kmer_counter/bkb_reader.h b/kmer_counter/bkb_reader.h
index b04e8c3..ca5dc26 100644
--- a/kmer_counter/bkb_reader.h
+++ b/kmer_counter/bkb_reader.h
@@ -5,8 +5,8 @@ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
-Version: 2.2.0
-Date : 2015-04-15
+Version: 2.3.0
+Date : 2015-08-21
*/
#ifndef _BKB_READER_H_
diff --git a/kmer_counter/bkb_sorter.h b/kmer_counter/bkb_sorter.h
index e52d45a..6a7bb2b 100644
--- a/kmer_counter/bkb_sorter.h
+++ b/kmer_counter/bkb_sorter.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _BKB_SORTER_H
@@ -185,9 +185,9 @@ CBigKmerBinSorter<KMER_T, SIZE>::CBigKmerBinSorter(CKMCParams& Params, CKMCQueue
n_omp_threads = Params.sm_n_omp_threads;
sum_n_rec = sum_n_plus_x_rec = 0;
- cutoff_max = Params.cutoff_max;
+ cutoff_max = (int32)Params.cutoff_max;
cutoff_min = Params.cutoff_min;
- counter_max = Params.counter_max;
+ counter_max = (int32)Params.counter_max;
}
//----------------------------------------------------------------------------------
diff --git a/kmer_counter/bkb_subbin.h b/kmer_counter/bkb_subbin.h
index c814852..f321d6d 100644
--- a/kmer_counter/bkb_subbin.h
+++ b/kmer_counter/bkb_subbin.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _BKB_SUBBIN_H
diff --git a/kmer_counter/bkb_uncompactor.h b/kmer_counter/bkb_uncompactor.h
index ab375e0..8042f01 100644
--- a/kmer_counter/bkb_uncompactor.h
+++ b/kmer_counter/bkb_uncompactor.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _BKB_UNCOMPACTOR_H
diff --git a/kmer_counter/bkb_writer.cpp b/kmer_counter/bkb_writer.cpp
index c86d66b..2943ad6 100644
--- a/kmer_counter/bkb_writer.cpp
+++ b/kmer_counter/bkb_writer.cpp
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "stdafx.h"
diff --git a/kmer_counter/bkb_writer.h b/kmer_counter/bkb_writer.h
index 5d064da..c896138 100644
--- a/kmer_counter/bkb_writer.h
+++ b/kmer_counter/bkb_writer.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _BKB_WRITER_H
diff --git a/kmer_counter/defs.h b/kmer_counter/defs.h
index dc10c50..5da5bdb 100644
--- a/kmer_counter/defs.h
+++ b/kmer_counter/defs.h
@@ -4,15 +4,15 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _DEFS_H
#define _DEFS_H
-#define KMC_VER "2.2.0"
-#define KMC_DATE "2015-04-15"
+#define KMC_VER "2.3.0"
+#define KMC_DATE "2015-08-21"
#define _CRT_SECURE_NO_WARNINGS
@@ -46,7 +46,7 @@
#define MAX_K 256
#endif
-#define MIN_K 10
+#define MIN_K 1
#define MIN_MEM 1
@@ -135,6 +135,9 @@ const int32 MAX_STR_LEN = 32768;
#define BYTE_LOG(x) (((x) < (1 << 8)) ? 1 : ((x) < (1 << 16)) ? 2 : ((x) < (1 << 24)) ? 3 : 4)
+// Number of bytes (1..8) needed to store the unsigned 64-bit value x. The argument is evaluated several times.
+#define BYTE_LOG_ULL(x) (((x) < (1ull << 8)) ? 1 : ((x) < (1ull << 16)) ? 2 : ((x) < (1ull << 24)) ? 3 : ((x) < (1ull << 32)) ? 4 : ((x) < (1ull << 40) ? 5 : ((x) < (1ull << 48) ? 6 : ((x) < (1ull << 56)) ? 7 : 8)))
+
+
#endif
// ***** EOF
diff --git a/kmer_counter/fastq_reader.cpp b/kmer_counter/fastq_reader.cpp
index 844a113..1026096 100644
--- a/kmer_counter/fastq_reader.cpp
+++ b/kmer_counter/fastq_reader.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include <algorithm>
diff --git a/kmer_counter/fastq_reader.h b/kmer_counter/fastq_reader.h
index 5132d23..302f7aa 100644
--- a/kmer_counter/fastq_reader.h
+++ b/kmer_counter/fastq_reader.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _FASTQ_READER_H
diff --git a/kmer_counter/kb_collector.h b/kmer_counter/kb_collector.h
index 1b2278b..dc59d94 100644
--- a/kmer_counter/kb_collector.h
+++ b/kmer_counter/kb_collector.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KB_COLLECTOR_H
diff --git a/kmer_counter/kb_completer.cpp b/kmer_counter/kb_completer.cpp
index babf0fa..fc06b1e 100644
--- a/kmer_counter/kb_completer.cpp
+++ b/kmer_counter/kb_completer.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include <algorithm>
#include <numeric>
@@ -43,9 +43,11 @@ CKmerBinCompleter::CKmerBinCompleter(CKMCParams &Params, CKMCQueues &Queues)
signature_len = Params.signature_len;
cutoff_min = Params.cutoff_min;
- cutoff_max = Params.cutoff_max;
- counter_max = Params.counter_max;
+ cutoff_max = (int32)Params.cutoff_max;
+ counter_max = (int32)Params.counter_max;
lut_prefix_len = Params.lut_prefix_len;
+ both_strands = Params.both_strands;
+
kmer_t_size = Params.KMER_T_size;
@@ -242,11 +244,13 @@ void CKmerBinCompleter::ProcessBinsSecondStage()
store_uint(out_lut, cutoff_max, 4); offset += 4;
store_uint(out_lut, n_unique - n_cutoff_min - n_cutoff_max, 8); offset += 8;
+ store_uint(out_lut, both_strands ? 0 : 1, 1); offset++;
+
// Space for future use
- for (int32 i = 0; i < 7; ++i)
+ for (int32 i = 0; i < 27; ++i)
{
- store_uint(out_lut, 0, 4);
- offset += 4;
+ store_uint(out_lut, 0, 1);
+ offset ++;
}
store_uint(out_lut, 0x200, 4);
diff --git a/kmer_counter/kb_completer.h b/kmer_counter/kb_completer.h
index fd01f5c..bfa57bf 100644
--- a/kmer_counter/kb_completer.h
+++ b/kmer_counter/kb_completer.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KB_COMPLETER_H
#define _KB_COMPLETER_H
@@ -19,7 +19,7 @@
#include <numeric>
#include <array>
#include <stdio.h>
-
+#include "small_k_buf.h"
//************************************************************************************************************
// CKmerBinCompleter - complete the sorted bins and store in a file
@@ -53,7 +53,7 @@ class CKmerBinCompleter {
int32 kmer_len;
int32 signature_len;
bool use_quake;
-
+ bool both_strands;
bool store_uint(FILE *out, uint64 x, uint32 size);
public:
@@ -83,6 +83,214 @@ public:
void InitStage2(CKMCParams& Params, CKMCQueues& Queues);
};
+
+//************************************************************************************************************
+// SmallKCompleter - completer for small k optimization
+//************************************************************************************************************
+template<bool QUAKE_MODE>
+class CSmallKCompleter // turns the dense counter table of the small-k path into the .kmc_pre/.kmc_suf output files
+{
+ CMemoryPool *pmm_small_k_completer; // pool from which Complete() reserves its staging buffer
+ uint64 n_unique, n_cutoff_min, n_cutoff_max; // non-zero counters seen / below cutoff_min / above cutoff_max; filled by Complete()
+ uint32 lut_prefix_len; // k-mer prefix length (in symbols) covered by the lookup table
+ int64 cutoff_max, counter_max; // upper cutoff, and the value stored counters are clamped to
+ int cutoff_min; // counters below this value are not stored
+ uint32 kmer_len;
+ int64 mem_tot_small_k_completer; // byte budget used to split the staging buffer
+ std::string output_file_name; // base name; ".kmc_pre" and ".kmc_suf" are appended
+ bool both_strands; // written (inverted) into the header by Complete()
+ bool use_quake; // selects 4-byte counters and mode 1 in the header
+ // Writes the `size` low-order bytes of x to out, least significant byte first.
+ bool store_uint(FILE *out, uint64 x, uint32 size);
+public:
+ CSmallKCompleter(CKMCParams& Params, CKMCQueues& Queues);
+ // Writes both output files from the counter table; exits the process if a file cannot be created.
+ template<typename COUNTER_TYPE>
+ bool Complete(CSmallKBuf<COUNTER_TYPE> results);
+ void GetTotal(uint64 &_n_unique, uint64 &_n_cutoff_min, uint64 &_n_cutoff_max); // reports the three statistics above
+
+};
+
+template<bool QUAKE_MODE> // Copies the settings Complete() needs out of Params/Queues; performs no I/O.
+CSmallKCompleter<QUAKE_MODE>::CSmallKCompleter(CKMCParams& Params, CKMCQueues& Queues)
+{
+    kmer_len = (uint32)Params.kmer_len;
+    lut_prefix_len = Params.lut_prefix_len;
+    cutoff_min = Params.cutoff_min;
+    cutoff_max = Params.cutoff_max;
+    counter_max = Params.counter_max;
+    both_strands = Params.both_strands;
+    use_quake = Params.use_quake;
+    // output target, staging-buffer budget and the pool that provides the buffer
+    output_file_name = Params.output_file_name;
+    mem_tot_small_k_completer = Params.mem_tot_small_k_completer;
+    pmm_small_k_completer = Queues.pmm_small_k_completer;
+    n_unique = n_cutoff_min = n_cutoff_max = 0; // statistics start at zero
+}
+
+template<bool QUAKE_MODE>
+bool CSmallKCompleter<QUAKE_MODE>::store_uint(FILE *out, uint64 x, uint32 size)
+{
+    // Emit the `size` low-order bytes of x, least significant byte first.
+    for (uint32 shift = 0; shift != size * 8; shift += 8)
+        putc((x >> shift) & 0xFF, out);
+    return true; // always true: the result of putc is not inspected
+}
+
+template<bool QUAKE_MODE> // Writes the small-k counters to <output_file_name>.kmc_pre / .kmc_suf (header marks KMC 1.x format).
+template<typename COUNTER_TYPE> // COUNTER_TYPE: element type of result.buf, which is indexed directly by the k-mer value.
+bool CSmallKCompleter<QUAKE_MODE>::Complete(CSmallKBuf<COUNTER_TYPE> result) // Returns true; exits the process if a file cannot be created.
+{
+    uchar* raw_buffer;
+    uint64 counter_size = 0;
+    if (use_quake)
+        counter_size = 4;
+    else
+        counter_size = min(BYTE_LOG_ULL((uint64)cutoff_max), BYTE_LOG_ULL((uint64)counter_max));
+    uint64 kmer_suf_bytes = (kmer_len - lut_prefix_len) / 4; // 2 bits per symbol, i.e. 4 symbols per byte
+    // One reserved block is split in two: a LUT staging area at the front (at most half of the
+    // budget) and a suffix-record staging area behind it. Each area is flushed to its file when full.
+    pmm_small_k_completer->reserve(raw_buffer);
+    uint32 lut_recs = (1 << 2 * lut_prefix_len);
+    uint32 lut_buf_recs = (uint32)(MIN(lut_recs * sizeof(uint64), (uint64)mem_tot_small_k_completer / 2) / sizeof(uint64));
+    uint32 lut_buf_pos = 0;
+    uint32 suf_size = (uint32)(mem_tot_small_k_completer - lut_buf_recs * sizeof(uint64));
+    uint32 suf_recs = (uint32)(suf_size / (counter_size + kmer_suf_bytes)); // whole records that fit; the flush test below converts back to bytes
+    uint32 suf_pos = 0;
+    // NOTE(review): assumes the suffix area holds at least one record (suf_recs > 0) - confirm against the memory setup.
+    uint64* lut = (uint64*)raw_buffer;
+    uchar* suf = raw_buffer + lut_buf_recs * sizeof(uint64);
+
+    FILE* suf_file, *pre_file;
+
+    string pre_file_name = output_file_name + ".kmc_pre";
+    string suf_file_name = output_file_name + ".kmc_suf";
+
+    pre_file = fopen(pre_file_name.c_str(), "wb");
+    if (!pre_file)
+    {
+        cout << "Error: Cannot create " << pre_file_name << "\n";
+        exit(1);
+        return false;
+    }
+    suf_file = fopen(suf_file_name.c_str(), "wb");
+
+    if (!suf_file)
+    {
+        cout << "Error: Cannot create " << suf_file_name << "\n";
+        fclose(pre_file);
+        exit(1);
+        return false;
+    }
+
+    char s_kmc_pre[] = "KMCP";
+    char s_kmc_suf[] = "KMCS";
+
+    // Markers at the beginning
+    fwrite(s_kmc_pre, 1, 4, pre_file);
+    fwrite(s_kmc_suf, 1, 4, suf_file);
+
+    // Walk every possible k-mer value in increasing order; each LUT entry is the number of records stored before that prefix.
+    CKmer<1> kmer;
+
+    uint64 prev_prefix = 0, prefix;
+
+    lut[lut_buf_pos++] = 0;
+    uint64 kmer_no = 0;
+    for (kmer.data = 0; kmer.data < (1ull << 2 * kmer_len); ++kmer.data)
+    {
+        prefix = kmer.remove_suffix(2 * (kmer_len - lut_prefix_len));
+
+        if (prefix != prev_prefix) //new prefix
+        {
+            prev_prefix = prefix;
+            lut[lut_buf_pos++] = kmer_no;
+            if (lut_buf_pos >= lut_buf_recs)
+            {
+                fwrite(lut, sizeof(uint64), lut_buf_pos, pre_file);
+                lut_buf_pos = 0;
+            }
+        }
+
+        if (result.buf[kmer.data]) //k-mer exists
+        {
+            ++n_unique;
+
+            if (result.buf[kmer.data] < (uint32)cutoff_min)
+                ++n_cutoff_min;
+            else if (result.buf[kmer.data] > (uint64)cutoff_max)
+                ++n_cutoff_max;
+            else
+            {
+                ++kmer_no;
+                if (result.buf[kmer.data] > (uint64)counter_max)
+                    result.buf[kmer.data] = (COUNTER_TYPE)counter_max; // clamp to the largest storable counter
+
+                for (int32 j = (int32)kmer_suf_bytes - 1; j >= 0; --j)
+                    suf[suf_pos++] = kmer.get_byte(j);
+
+                result.Store(kmer.data, suf, suf_pos, counter_size); // presumably appends counter_size bytes and advances suf_pos - verify in CSmallKBuf
+
+                if (suf_pos >= suf_recs * (kmer_suf_bytes + counter_size)) // staging area full (threshold in bytes)
+                {
+                    fwrite(suf, 1, suf_pos, suf_file);
+                    suf_pos = 0;
+                }
+            }
+        }
+    }
+
+    // Flush whatever is still staged, then append the header block at the end of the prefix file.
+    fwrite(lut, sizeof(uint64), lut_buf_pos, pre_file);
+    fwrite(suf, 1, suf_pos, suf_file);
+
+    uint32 offset = 0; // running size of the header block, stored after it
+
+    store_uint(pre_file, kmer_len, 4); offset += 4;
+    store_uint(pre_file, (uint32)use_quake, 4); offset += 4; // mode: 0 (counting), 1 (Quake-compatible counting)
+    store_uint(pre_file, counter_size, 4); offset += 4;
+    store_uint(pre_file, lut_prefix_len, 4); offset += 4;
+    store_uint(pre_file, cutoff_min, 4); offset += 4;
+    store_uint(pre_file, cutoff_max, 4); offset += 4; // low 32 bits; the high half follows below
+    store_uint(pre_file, n_unique - n_cutoff_min - n_cutoff_max, 8); offset += 8; // number of stored records
+
+
+    store_uint(pre_file, both_strands ? 0 : 1, 1); offset++;
+
+    store_uint(pre_file, 0, 1); offset++;
+    store_uint(pre_file, 0, 1); offset++;
+    store_uint(pre_file, 0, 1); offset++;
+
+    store_uint(pre_file, cutoff_max >> 32, 4); offset += 4; // high 32 bits of cutoff_max
+    // Space for future use
+    for (int32 i = 0; i < 20; ++i)
+    {
+        store_uint(pre_file, 0, 1);
+        offset++;
+    }
+
+    store_uint(pre_file, 0x0, 4); //KMC 1.x format
+    offset += 4;
+
+    store_uint(pre_file, offset, 4);
+
+    // Markers at the end
+    fwrite(s_kmc_pre, 1, 4, pre_file);
+    fwrite(s_kmc_suf, 1, 4, suf_file);
+    fclose(pre_file);
+    fclose(suf_file);
+    pmm_small_k_completer->free(raw_buffer);
+
+    return true;
+}
+
+template<bool QUAKE_MODE> // Copies the statistics gathered by Complete() into the three output references.
+void CSmallKCompleter<QUAKE_MODE>::GetTotal(uint64 &_n_unique, uint64 &_n_cutoff_min, uint64 &_n_cutoff_max)
+{
+ _n_unique = n_unique; // k-mers with a non-zero counter
+ _n_cutoff_min = n_cutoff_min; // of those, counters below cutoff_min
+ _n_cutoff_max = n_cutoff_max; // of those, counters above cutoff_max
+}
#endif
// ***** EOF
diff --git a/kmer_counter/kb_reader.h b/kmer_counter/kb_reader.h
index e91f3f9..edff8d4 100644
--- a/kmer_counter/kb_reader.h
+++ b/kmer_counter/kb_reader.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KB_READER_H
@@ -75,8 +75,8 @@ template <typename KMER_T, unsigned SIZE> CKmerBinReader<KMER_T, SIZE>::CKmerBin
kmer_len = Params.kmer_len;
cutoff_min = Params.cutoff_min;
- cutoff_max = Params.cutoff_max;
- counter_max = Params.counter_max;
+ cutoff_max = (int32)Params.cutoff_max;
+ counter_max = (int32)Params.counter_max;
both_strands = Params.both_strands;
use_quake = Params.use_quake;
max_x = Params.max_x;
diff --git a/kmer_counter/kb_sorter.h b/kmer_counter/kb_sorter.h
index 8feb565..85383a2 100644
--- a/kmer_counter/kb_sorter.h
+++ b/kmer_counter/kb_sorter.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KB_SORTER_H
@@ -14,6 +14,7 @@
#define DEBUGG_INFO
#include "defs.h"
+#include "prob_qual.h"
#include "params.h"
#include "kmer.h"
#include "radix.h"
@@ -129,71 +130,11 @@ public:
};
template <unsigned SIZE> class CKmerBinSorter_Impl<CKmerQuake<SIZE>, SIZE> {
- static double prob_qual[94];
- static double inv_prob_qual[94];
- static double MIN_PROB_QUAL_VALUE;
public:
static void Compact(CKmerBinSorter<CKmerQuake<SIZE>, SIZE> &ptr);
static void Expand(CKmerBinSorter<CKmerQuake<SIZE>, SIZE> &ptr, uint64 tmp_size);
};
-// K-mers with probability less than MIN_PROB_QUAL_VALUE will not be counted
-template <unsigned SIZE> double CKmerBinSorter_Impl<CKmerQuake<SIZE>, SIZE>::MIN_PROB_QUAL_VALUE = 0.0000;
-
-
-template <unsigned SIZE> double CKmerBinSorter_Impl<CKmerQuake<SIZE>, SIZE>::prob_qual[94] = {
- 0.2500000000000000, 0.2500000000000000, 0.3690426555198070, 0.4988127663727280,
- 0.6018928294465030, 0.6837722339831620, 0.7488113568490420, 0.8004737685031120,
- 0.8415106807538890, 0.8741074588205830, 0.9000000000000000, 0.9205671765275720,
- 0.9369042655519810, 0.9498812766372730, 0.9601892829446500, 0.9683772233983160,
- 0.9748811356849040, 0.9800473768503110, 0.9841510680753890, 0.9874107458820580,
- 0.9900000000000000, 0.9920567176527570, 0.9936904265551980, 0.9949881276637270,
- 0.9960189282944650, 0.9968377223398320, 0.9974881135684900, 0.9980047376850310,
- 0.9984151068075390, 0.9987410745882060, 0.9990000000000000, 0.9992056717652760,
- 0.9993690426555200, 0.9994988127663730, 0.9996018928294460, 0.9996837722339830,
- 0.9997488113568490, 0.9998004737685030, 0.9998415106807540, 0.9998741074588210,
- 0.9999000000000000, 0.9999205671765280, 0.9999369042655520, 0.9999498812766370,
- 0.9999601892829450, 0.9999683772233980, 0.9999748811356850, 0.9999800473768500,
- 0.9999841510680750, 0.9999874107458820, 0.9999900000000000, 0.9999920567176530,
- 0.9999936904265550, 0.9999949881276640, 0.9999960189282940, 0.9999968377223400,
- 0.9999974881135680, 0.9999980047376850, 0.9999984151068080, 0.9999987410745880,
- 0.9999990000000000, 0.9999992056717650, 0.9999993690426560, 0.9999994988127660,
- 0.9999996018928290, 0.9999996837722340, 0.9999997488113570, 0.9999998004737680,
- 0.9999998415106810, 0.9999998741074590, 0.9999999000000000, 0.9999999205671770,
- 0.9999999369042660, 0.9999999498812770, 0.9999999601892830, 0.9999999683772230,
- 0.9999999748811360, 0.9999999800473770, 0.9999999841510680, 0.9999999874107460,
- 0.9999999900000000, 0.9999999920567180, 0.9999999936904270, 0.9999999949881280,
- 0.9999999960189280, 0.9999999968377220, 0.9999999974881140, 0.9999999980047380,
- 0.9999999984151070, 0.9999999987410750, 0.9999999990000000, 0.9999999992056720,
- 0.9999999993690430, 0.9999999994988130 };
-
-template <unsigned SIZE> double CKmerBinSorter_Impl<CKmerQuake<SIZE>, SIZE>::inv_prob_qual[94] = {
- 4.0000000000000000, 4.0000000000000000, 2.7097138638119600, 2.0047602375372500,
- 1.6614253419825500, 1.4624752955742600, 1.3354498310601800, 1.2492601748462100,
- 1.1883390465158700, 1.1440241012807300, 1.1111111111111100, 1.0862868300084900,
- 1.0673449110735400, 1.0527631448218000, 1.0414613220148200, 1.0326554320337200,
- 1.0257660789563300, 1.0203588353185700, 1.0161041657513100, 1.0127497641386300,
- 1.0101010101010100, 1.0080068832818700, 1.0063496369454600, 1.0050371177272600,
- 1.0039969839853900, 1.0031723093832600, 1.0025182118938000, 1.0019992513458400,
- 1.0015874090662800, 1.0012605123027600, 1.0010010010010000, 1.0007949596936500,
- 1.0006313557030000, 1.0005014385482300, 1.0003982657229900, 1.0003163277976500,
- 1.0002512517547400, 1.0001995660501600, 1.0001585144420900, 1.0001259083921100,
- 1.0001000100010000, 1.0000794391335500, 1.0000630997157700, 1.0000501212353700,
- 1.0000398123020100, 1.0000316237766300, 1.0000251194952900, 1.0000199530212600,
- 1.0000158491831200, 1.0000125894126100, 1.0000100001000000, 1.0000079433454400,
- 1.0000063096132600, 1.0000050118974600, 1.0000039810875500, 1.0000031622876600,
- 1.0000025118927400, 1.0000019952663000, 1.0000015848957000, 1.0000012589270000,
- 1.0000010000010000, 1.0000007943288700, 1.0000006309577400, 1.0000005011874800,
- 1.0000003981073300, 1.0000003162278700, 1.0000002511887100, 1.0000001995262700,
- 1.0000001584893400, 1.0000001258925600, 1.0000001000000100, 1.0000000794328300,
- 1.0000000630957400, 1.0000000501187300, 1.0000000398107200, 1.0000000316227800,
- 1.0000000251188600, 1.0000000199526200, 1.0000000158489300, 1.0000000125892500,
- 1.0000000100000000, 1.0000000079432800, 1.0000000063095700, 1.0000000050118700,
- 1.0000000039810700, 1.0000000031622800, 1.0000000025118900, 1.0000000019952600,
- 1.0000000015848900, 1.0000000012589300, 1.0000000010000000, 1.0000000007943300,
- 1.0000000006309600, 1.0000000005011900 };
-
-
//************************************************************************************************************
// CKmerBinSorter
//************************************************************************************************************
@@ -219,8 +160,8 @@ template <typename KMER_T, unsigned SIZE> CKmerBinSorter<KMER_T, SIZE>::CKmerBin
memory_bins = Queues.memory_bins;
cutoff_min = Params.cutoff_min;
- cutoff_max = Params.cutoff_max;
- counter_max = Params.counter_max;
+ cutoff_max = (int32)Params.cutoff_max;
+ counter_max = (int32)Params.counter_max;
max_x = Params.max_x;
use_quake = Params.use_quake;
@@ -768,14 +709,14 @@ template <unsigned SIZE> void CKmerBinSorter_Impl<CKmerQuake<SIZE>, SIZE>::Expan
symb = (data_p[pos] >> 6) & 3;
qual = data_p[pos++] & 63;
- inv_probs[inv_probs_pos++] = inv_prob_qual[qual];
+ inv_probs[inv_probs_pos++] = CProbQual::inv_prob_qual[qual];
current_kmer.SHL_insert_2bits(symb);
kmer_rev.SHR_insert_2bits(3 - symb, kmer_len_shift);
- kmer_prob *= prob_qual[qual];
+ kmer_prob *= CProbQual::prob_qual[qual];
}
current_kmer.mask(kmer_mask);
- if (kmer_prob >= MIN_PROB_QUAL_VALUE)
+ if (kmer_prob >= CProbQual::MIN_PROB_QUAL_VALUE)
{
kmer_can = current_kmer < kmer_rev ? current_kmer : kmer_rev;
kmer_can.quality = (float)kmer_prob;
@@ -790,9 +731,9 @@ template <unsigned SIZE> void CKmerBinSorter_Impl<CKmerQuake<SIZE>, SIZE>::Expan
current_kmer.mask(kmer_mask);
kmer_rev.SHR_insert_2bits(3 - symb, kmer_len_shift);
- kmer_prob *= prob_qual[qual] * inv_probs[inv_probs_pos - ptr.kmer_len];
- inv_probs[inv_probs_pos++] = inv_prob_qual[qual];
- if (kmer_prob >= MIN_PROB_QUAL_VALUE)
+ kmer_prob *= CProbQual::prob_qual[qual] * inv_probs[inv_probs_pos - ptr.kmer_len];
+ inv_probs[inv_probs_pos++] = CProbQual::inv_prob_qual[qual];
+ if (kmer_prob >= CProbQual::MIN_PROB_QUAL_VALUE)
{
kmer_can = current_kmer < kmer_rev ? current_kmer : kmer_rev;
kmer_can.quality = (float)kmer_prob;
@@ -812,13 +753,13 @@ template <unsigned SIZE> void CKmerBinSorter_Impl<CKmerQuake<SIZE>, SIZE>::Expan
symb = (data_p[pos] >> 6) & 3;
qual = data_p[pos++] & 63;
- inv_probs[inv_probs_pos++] = inv_prob_qual[qual];
+ inv_probs[inv_probs_pos++] = CProbQual::inv_prob_qual[qual];
current_kmer.SHL_insert_2bits(symb);
- kmer_prob *= prob_qual[qual];
+ kmer_prob *= CProbQual::prob_qual[qual];
}
current_kmer.mask(kmer_mask);
- if (kmer_prob >= MIN_PROB_QUAL_VALUE)
+ if (kmer_prob >= CProbQual::MIN_PROB_QUAL_VALUE)
{
current_kmer.quality = (float)kmer_prob;
ptr.buffer_input[ptr.input_pos++].set(current_kmer);
@@ -831,9 +772,9 @@ template <unsigned SIZE> void CKmerBinSorter_Impl<CKmerQuake<SIZE>, SIZE>::Expan
current_kmer.SHL_insert_2bits(symb);
current_kmer.mask(kmer_mask);
- kmer_prob *= prob_qual[qual] * inv_probs[inv_probs_pos - ptr.kmer_len];
- inv_probs[inv_probs_pos++] = inv_prob_qual[qual];
- if (kmer_prob >= MIN_PROB_QUAL_VALUE)
+ kmer_prob *= CProbQual::prob_qual[qual] * inv_probs[inv_probs_pos - ptr.kmer_len];
+ inv_probs[inv_probs_pos++] = CProbQual::inv_prob_qual[qual];
+ if (kmer_prob >= CProbQual::MIN_PROB_QUAL_VALUE)
{
current_kmer.quality = (float)kmer_prob;
ptr.buffer_input[ptr.input_pos++].set(current_kmer);
diff --git a/kmer_counter/kb_storer.cpp b/kmer_counter/kb_storer.cpp
index 7cccf1d..a225c37 100644
--- a/kmer_counter/kb_storer.cpp
+++ b/kmer_counter/kb_storer.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include <algorithm>
#include <numeric>
diff --git a/kmer_counter/kb_storer.h b/kmer_counter/kb_storer.h
index 4067116..7f6978d 100644
--- a/kmer_counter/kb_storer.h
+++ b/kmer_counter/kb_storer.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KB_STORER_H
diff --git a/kmer_counter/kmc.h b/kmer_counter/kmc.h
index a11241f..197cae1 100644
--- a/kmer_counter/kmc.h
+++ b/kmer_counter/kmc.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KMC_H
@@ -43,6 +43,8 @@
using namespace std;
+template<typename KMER_T, unsigned SIZE, bool QUAKE_MODE>
+class CSmallKWrapper; // forward declaration so CKMC can name it as a friend; specialized on QUAKE_MODE after the CKMC class
template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE> class CKMC {
bool initialized;
@@ -67,6 +69,7 @@ template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE> class CKMC {
vector<CWStatsSplitter<false>*> w_stats_splitters;
vector<CWFastqReader*> w_fastqs;
vector<CWSplitter<QUAKE_MODE>*> w_splitters;
+
CWKmerBinStorer *w_storer;
CWKmerBinReader<KMER_T, SIZE>* w_reader;
@@ -84,6 +87,9 @@ template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE> class CKMC {
void ShowSettingsStage1();
void ShowSettingsStage2();
+ friend class CSmallKWrapper<KMER_T, SIZE, QUAKE_MODE>;
+ bool AdjustMemoryLimitsSmallK();
+ template<typename COUNTER_TYPE> bool ProcessSmallKOptimization();
public:
CKMC();
@@ -95,6 +101,35 @@ public:
};
+template<typename KMER_T, unsigned SIZE>
+class CSmallKWrapper<KMER_T, SIZE, true> // QUAKE_MODE == true specialization
+{
+public:
+ static bool Process(CKMC<KMER_T, SIZE, true>& ptr); // runs the small-k path of ptr with float counters
+};
+
+template<typename KMER_T, unsigned SIZE>
+class CSmallKWrapper<KMER_T, SIZE, false> // QUAKE_MODE == false specialization
+{
+public:
+ static bool Process(CKMC<KMER_T, SIZE, false>& ptr); // runs the small-k path of ptr with uint32 or uint64 counters
+};
+
+template<typename KMER_T, unsigned SIZE> // Quake mode: counters are accumulated as float.
+bool CSmallKWrapper<KMER_T, SIZE, true>::Process(CKMC<KMER_T, SIZE, true>& ptr)
+{
+ return ptr.template ProcessSmallKOptimization<float>(); // result of the small-k run is passed through unchanged
+}
+
+template<typename KMER_T, unsigned SIZE>
+bool CSmallKWrapper<KMER_T, SIZE, false>::Process(CKMC<KMER_T, SIZE, false>& ptr)
+{
+    // 32-bit counters are used unless the upper cutoff itself does not fit in 32 bits.
+    if ((uint64)ptr.Params.cutoff_max <= ((1ull << 32) - 1))
+        return ptr.template ProcessSmallKOptimization<uint32>();
+    return ptr.template ProcessSmallKOptimization<uint64>();
+}
+
//----------------------------------------------------------------------------------
template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE> CKMC<KMER_T, SIZE, QUAKE_MODE>::CKMC()
{
@@ -483,9 +518,223 @@ template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE> void CKMC<KMER_T, SIZ
}
//----------------------------------------------------------------------------------
+template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE> bool CKMC<KMER_T, SIZE, QUAKE_MODE>::AdjustMemoryLimitsSmallK()
+{ // Sizes the small-k buffers within Params.max_mem_size; returns false (Params untouched) when the small-k path cannot be used.
+    if (Params.kmer_len > 13)
+        return false;
+    // One counter per possible k-mer; counters widen to 8 bytes only when cutoff_max needs more than 32 bits.
+    uint32 counter_size = 4; //in bytes
+    if ((uint64)Params.cutoff_max > ((1ull << 32) - 1))
+        counter_size = 8;
+
+    int tmp_n_splitters = Params.n_splitters;
+    int tmp_n_readers = Params.n_readers;
+    int tmp_fastq_buffer_size = 0;
+    int64 tmp_mem_part_pmm_fastq = 0;
+    int64 tmp_mem_tot_pmm_fastq = 0;
+    int64 tmp_mem_part_pmm_reads = (CSplitter<QUAKE_MODE>::MAX_LINE_SIZE + 1) * sizeof(double);
+    int64 tmp_mem_tot_pmm_reads = 0;
+    int32 tmp_gzip_buffer_size = Params.gzip_buffer_size;
+
+    int64 tmp_mem_part_small_k_buf = (1ll << 2 * Params.kmer_len) * counter_size;//no of possible k-mers * counter size
+    int64 tmp_mem_tot_small_k_buf = 0;
+
+    int64 mim_mem_for_readers = (int64)tmp_n_readers * (16 << 20); // widened first: int * int could overflow
+    // Drop splitters until the per-splitter buffers plus the readers' minimum fit into the limit.
+    while (tmp_n_splitters)
+    {
+        tmp_mem_tot_pmm_reads = tmp_mem_part_pmm_reads * 3 * tmp_n_splitters;
+        tmp_mem_tot_small_k_buf = tmp_mem_part_small_k_buf * tmp_n_splitters;
+
+        if (tmp_mem_tot_pmm_reads + tmp_mem_tot_small_k_buf + mim_mem_for_readers < Params.max_mem_size)
+            break;
+
+        --tmp_n_splitters;
+    }
+
+    if (!tmp_n_splitters)
+        return false;
+
+    int64 left_for_readers = Params.max_mem_size - tmp_mem_tot_pmm_reads - tmp_mem_tot_small_k_buf;
+    // At most 66% of what is left goes to the gzip buffers; halve them until they fit.
+    int64 max_for_gzip = (int64)(0.66 * left_for_readers);
+    while ((int64)tmp_n_readers * tmp_gzip_buffer_size > max_for_gzip)
+        tmp_gzip_buffer_size /= 2;
+
+    int64 for_pmm_fastq = left_for_readers - (int64)tmp_n_readers * tmp_gzip_buffer_size;
+
+    // Shrink the fastq buffer (start 32 MB, then 24, 16, 12, 8, ...) until the whole fastq pool fits.
+    tmp_fastq_buffer_size = 32 << 20;
+    do {
+        if (tmp_fastq_buffer_size & (tmp_fastq_buffer_size - 1))
+            tmp_fastq_buffer_size &= tmp_fastq_buffer_size - 1;
+        else
+            tmp_fastq_buffer_size = tmp_fastq_buffer_size / 2 + tmp_fastq_buffer_size / 4;
+        tmp_mem_part_pmm_fastq = tmp_fastq_buffer_size + CFastqReader::OVERHEAD_SIZE;
+        tmp_mem_tot_pmm_fastq = tmp_mem_part_pmm_fastq * (tmp_n_readers + tmp_n_splitters + 96);
+    } while (tmp_mem_tot_pmm_fastq > for_pmm_fastq && tmp_fastq_buffer_size > 0); // size 0 cannot shrink further
+    if (tmp_mem_tot_pmm_fastq > for_pmm_fastq) return false; // even an empty buffer does not fit: give up on the small-k path
+    Params.n_splitters = tmp_n_splitters;
+    Params.n_readers = tmp_n_readers;
+    Params.fastq_buffer_size = tmp_fastq_buffer_size;
+    Params.mem_part_pmm_fastq = tmp_mem_part_pmm_fastq;
+    Params.mem_part_small_k_completer = Params.mem_tot_small_k_completer = Params.mem_tot_pmm_fastq = tmp_mem_tot_pmm_fastq; // completer budget equals the fastq pool size
+    Params.mem_part_pmm_reads = tmp_mem_part_pmm_reads;
+    Params.mem_tot_pmm_reads = tmp_mem_tot_pmm_reads;
+    Params.gzip_buffer_size = tmp_gzip_buffer_size;
+    Params.mem_part_small_k_buf = tmp_mem_part_small_k_buf;
+    Params.mem_tot_small_k_buf = tmp_mem_tot_small_k_buf;
+
+    return true;
+}
+
+//----------------------------------------------------------------------------------
+template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE>
+template<typename COUNTER_TYPE> // COUNTER_TYPE: type of one per-k-mer counter in the dense tables
+bool CKMC<KMER_T, SIZE, QUAKE_MODE>::ProcessSmallKOptimization() // Small-k run: count into dense tables, merge them, write the database. Always returns true.
+{
+ vector<CWSmallKSplitter<QUAKE_MODE, COUNTER_TYPE>*> w_small_k_splitters; //For small k values only
+
+ w1.startTimer(); // stage 1: reading and counting
+ Queues.input_files_queue = new CInputFilesQueue(Params.input_file_names);
+ Queues.part_queue = new CPartQueue(Params.n_readers);
+
+ Queues.pmm_fastq = new CMemoryPool(Params.mem_tot_pmm_fastq, Params.mem_part_pmm_fastq);
+ Queues.pmm_reads = new CMemoryPool(Params.mem_tot_pmm_reads, Params.mem_part_pmm_reads);
+ Queues.pmm_small_k_buf = new CMemoryPool(Params.mem_tot_small_k_buf, Params.mem_part_small_k_buf);
+
+ w_small_k_splitters.resize(Params.n_splitters);
+ // Splitter threads are started before the reader threads.
+ for (int i = 0; i < Params.n_splitters; ++i)
+ {
+ w_small_k_splitters[i] = new CWSmallKSplitter<QUAKE_MODE, COUNTER_TYPE>(Params, Queues);
+ gr1_2.push_back(thread(std::ref(*w_small_k_splitters[i])));
+ }
+
+ w_fastqs.resize(Params.n_readers);
+ for (int i = 0; i < Params.n_readers; ++i)
+ {
+ w_fastqs[i] = new CWFastqReader(Params, Queues);
+ gr1_1.push_back(thread(std::ref(*w_fastqs[i])));
+ }
+ // Wait for all readers, then all splitters.
+ for (auto& t : gr1_1)
+ t.join();
+ for (auto& t : gr1_2)
+ t.join();
+
+ for (auto r : w_fastqs)
+ delete r; // the vector keeps the now-deleted pointers; it is not cleared in this function
+
+ vector<CSmallKBuf<COUNTER_TYPE>> results(Params.n_splitters);
+
+ for (int i = 0; i < Params.n_splitters; ++i)
+ {
+ results[i] = w_small_k_splitters[i]->GetResult();
+ }
+
+ w1.stopTimer();
+
+ w2.startTimer(); // stage 2: merging and writing
+
+ uint64 n_kmers = 0;
+ // Add every other splitter's table into results[0], entry by entry.
+ for (int j = 1; j < Params.n_splitters; ++j)
+ {
+ for (int i = 0; i < (1 << 2 * Params.kmer_len); ++i)
+ results[0].buf[i] += results[j].buf[i];
+ }
+
+ n_total = 0;
+
+ // n_kmers = number of distinct k-mers, i.e. non-zero entries of the merged table.
+ for (int j = 0; j < (1 << 2 * Params.kmer_len); ++j)
+ if (results[0].buf[j]) ++n_kmers;
+
+ uint64 tmp_n_reads;
+ tmp_size = 0;
+ n_reads = 0;
+ n_total_super_kmers = 0;
+ for (auto s : w_small_k_splitters) // collect per-splitter totals, then dispose of the splitter
+ {
+ s->GetTotal(tmp_n_reads);
+ n_reads += tmp_n_reads;
+ n_total += s->GetTotalKmers();
+ s->Release();
+ delete s;
+ }
+
+
+ Queues.pmm_fastq->release();
+ delete Queues.pmm_fastq;
+
+ // Pick the LUT prefix length (1..15) that minimizes suffix bytes plus LUT bytes.
+ uint32 best_lut_prefix_len = 0;
+ uint64 best_mem_amount = 1ull << 62;
+
+
+ uint32 counter_size = 0;
+ if (Params.use_quake)
+ counter_size = 4;
+ else
+ counter_size = min(BYTE_LOG(Params.cutoff_max), BYTE_LOG(Params.counter_max)); // NOTE(review): CSmallKCompleter::Complete uses BYTE_LOG_ULL for the same quantity - confirm both agree for 64-bit cutoffs
+
+ for (Params.lut_prefix_len = 1; Params.lut_prefix_len < 16; ++Params.lut_prefix_len)
+ {
+ uint32 suffix_len;
+ if (Params.lut_prefix_len > (uint32)Params.kmer_len)
+ suffix_len = 0;
+ else
+ suffix_len = Params.kmer_len - Params.lut_prefix_len;
+
+ if (suffix_len % 4) // only suffix lengths that are a multiple of 4 symbols are considered
+ continue;
+
+ uint64 suf_mem = n_kmers * (suffix_len / 4 + counter_size);
+ uint64 lut_mem = (1ull << (2 * Params.lut_prefix_len)) * sizeof(uint64);
+
+ if (suf_mem + lut_mem < best_mem_amount)
+ {
+ best_lut_prefix_len = Params.lut_prefix_len;
+ best_mem_amount = suf_mem + lut_mem;
+ }
+ }
+
+ Params.lut_prefix_len = best_lut_prefix_len;
+
+ Queues.pmm_small_k_completer = new CMemoryPool(Params.mem_tot_small_k_completer, Params.mem_part_small_k_completer);
+ // Write the output files from the merged table and fetch the resulting statistics.
+ CSmallKCompleter<QUAKE_MODE> small_k_completer(Params, Queues);
+ small_k_completer.Complete(results[0]);
+ small_k_completer.GetTotal(n_unique, n_cutoff_min, n_cutoff_max);
+
+ Queues.pmm_reads->release();
+ Queues.pmm_small_k_buf->release();
+ Queues.pmm_small_k_completer->release();
+ delete Queues.pmm_small_k_completer;
+ delete Queues.pmm_reads;
+ delete Queues.pmm_small_k_buf;
+ w2.stopTimer();
+ cout << "\n";
+ return true;
+}
+
+//----------------------------------------------------------------------------------
// Run the counter
template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE> bool CKMC<KMER_T, SIZE, QUAKE_MODE>::Process()
{
+ if (!initialized)
+ return false;
+
+ if (AdjustMemoryLimitsSmallK())
+ {
+ if (Params.verbose)
+ {
+ cout << "\nSmall k optimization on!\n";
+ }
+ return CSmallKWrapper<KMER_T, SIZE, QUAKE_MODE>::Process(*this);
+ }
+
int32 bin_id;
CMemDiskFile *file;
string name;
@@ -494,9 +743,6 @@ template <typename KMER_T, unsigned SIZE, bool QUAKE_MODE> bool CKMC<KMER_T, SIZ
uint64 n_plus_x_recs;
uint64 n_super_kmers;
- if (!initialized)
- return false;
-
if (!AdjustMemoryLimits())
return false;
diff --git a/kmer_counter/kmer.cpp b/kmer_counter/kmer.cpp
index 6779a84..edc90fe 100644
--- a/kmer_counter/kmer.cpp
+++ b/kmer_counter/kmer.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "defs.h"
diff --git a/kmer_counter/kmer.h b/kmer_counter/kmer.h
index be0e4a5..9b1a88c 100644
--- a/kmer_counter/kmer.h
+++ b/kmer_counter/kmer.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KMER_H
diff --git a/kmer_counter/kmer_counter.cpp b/kmer_counter/kmer_counter.cpp
index df805dc..ba49f37 100644
--- a/kmer_counter/kmer_counter.cpp
+++ b/kmer_counter/kmer_counter.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include <fstream>
@@ -205,10 +205,10 @@ bool parse_parameters(int argc, char *argv[])
Params.p_ci = atoi(&argv[i][3]);
// Maximum counter threshold
else if(strncmp(argv[i], "-cx", 3) == 0)
- Params.p_cx = atoi(&argv[i][3]);
+ Params.p_cx = atoll(&argv[i][3]);
// Maximal counter value
else if(strncmp(argv[i], "-cs", 3) == 0)
- Params.p_cs = atoi(&argv[i][3]);
+ Params.p_cs = atoll(&argv[i][3]);
// Quake mode
else if(strncmp(argv[i], "-q", 2) == 0)
{
@@ -380,6 +380,21 @@ bool parse_parameters(int argc, char *argv[])
Params.p_strict_mem = false;
}
+
+ if (Params.p_k > 9)
+ {
+ if ((uint64)Params.p_cx > ((1ull << 32) - 1))
+ {
+ cout << "Warning: for k > 9 maximum value of -cx is 4294967295\n";
+ Params.p_cx = 4294967295;
+ }
+ if ((uint64)Params.p_cs > ((1ull << 32) - 1))
+ {
+ cout << "Warning: for k > 9 maximum value of -cs is 4294967295\n";
+ Params.p_cs = 4294967295;
+ }
+ }
+
return true;
}
diff --git a/kmer_counter/kmer_counter.vcxproj b/kmer_counter/kmer_counter.vcxproj
index 207867f..65d42fe 100644
--- a/kmer_counter/kmer_counter.vcxproj
+++ b/kmer_counter/kmer_counter.vcxproj
@@ -171,7 +171,6 @@
<ClInclude Include="bkb_subbin.h" />
<ClInclude Include="bkb_writer.h" />
<ClInclude Include="defs.h" />
- <ClInclude Include="develop.h" />
<ClInclude Include="fastq_reader.h" />
<ClInclude Include="bkb_uncompactor.h" />
<ClInclude Include="kb_collector.h" />
@@ -190,7 +189,9 @@
<ClInclude Include="mem_disk_file.h" />
<ClInclude Include="meta_oper.h" />
<ClInclude Include="mmer.h" />
+ <ClInclude Include="prob_qual.h" />
<ClInclude Include="rev_byte.h" />
+ <ClInclude Include="small_k_buf.h" />
<ClInclude Include="s_mapper.h" />
<ClInclude Include="params.h" />
<ClInclude Include="queues.h" />
@@ -203,7 +204,6 @@
<ItemGroup>
<ClCompile Include="bkb_reader.cpp" />
<ClCompile Include="bkb_writer.cpp" />
- <ClCompile Include="develop.cpp" />
<ClCompile Include="fastq_reader.cpp" />
<ClCompile Include="kb_completer.cpp" />
<ClCompile Include="kb_storer.cpp" />
@@ -211,6 +211,7 @@
<ClCompile Include="kmer_counter.cpp" />
<ClCompile Include="mem_disk_file.cpp" />
<ClCompile Include="mmer.cpp" />
+ <ClCompile Include="prob_qual.cpp" />
<ClCompile Include="radix.cpp" />
<ClCompile Include="rev_byte.cpp" />
<ClCompile Include="stdafx.cpp">
diff --git a/kmer_counter/kxmer_set.h b/kmer_counter/kxmer_set.h
index df0b800..30ca2fa 100644
--- a/kmer_counter/kxmer_set.h
+++ b/kmer_counter/kxmer_set.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _KXMER_SET_
#define _KXMER_SET_
diff --git a/kmer_counter/mem_disk_file.cpp b/kmer_counter/mem_disk_file.cpp
index 3ea4774..5e0d0fb 100644
--- a/kmer_counter/mem_disk_file.cpp
+++ b/kmer_counter/mem_disk_file.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "mem_disk_file.h"
diff --git a/kmer_counter/mem_disk_file.h b/kmer_counter/mem_disk_file.h
index 9ee816d..9c64f73 100644
--- a/kmer_counter/mem_disk_file.h
+++ b/kmer_counter/mem_disk_file.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _MEM_DISK_FILE_H
diff --git a/kmer_counter/meta_oper.h b/kmer_counter/meta_oper.h
index 9d82a1d..af04896 100644
--- a/kmer_counter/meta_oper.h
+++ b/kmer_counter/meta_oper.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _META_OPER_H
diff --git a/kmer_counter/mmer.cpp b/kmer_counter/mmer.cpp
index 814467d..f1d8e12 100644
--- a/kmer_counter/mmer.cpp
+++ b/kmer_counter/mmer.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "mmer.h"
diff --git a/kmer_counter/mmer.h b/kmer_counter/mmer.h
index b4e40c1..851e3b4 100644
--- a/kmer_counter/mmer.h
+++ b/kmer_counter/mmer.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _MMER_H
diff --git a/kmer_counter/params.h b/kmer_counter/params.h
index 77c6cab..3ae800b 100644
--- a/kmer_counter/params.h
+++ b/kmer_counter/params.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _PARAMS_H
@@ -33,8 +33,8 @@ struct CKMCParams {
int p_so; // no. of OpenMP threads for sorting
int p_sr; // no. of sorting threads
int p_ci; // do not count k-mers occurring less than
- int p_cx; // do not count k-mers occurring more than
- int p_cs; // maximal counter value
+ int64 p_cx; // do not count k-mers occurring more than
+ int64 p_cs; // maximal counter value
bool p_quake; // use Quake-compatibile counting
bool p_strict_mem; // use strict memory limit mode
bool p_mem_mode; // use RAM instead of disk
@@ -81,13 +81,18 @@ struct CKMCParams {
int64 mem_tot_pmm_epxand;
int64 mem_part_pmm_epxand;
+ int64 mem_part_small_k_buf;
+ int64 mem_tot_small_k_buf;
+ int64 mem_part_small_k_completer;
+ int64 mem_tot_small_k_completer;
+
bool verbose;
int kmer_len; // kmer length
int signature_len;
int cutoff_min; // exclude k-mers occurring less than times
- int cutoff_max; // exclude k-mers occurring more than times
- int counter_max; // maximal counter value
+ int64 cutoff_max; // exclude k-mers occurring more than times
+ int64 counter_max; // maximal counter value
bool use_quake; // use Quake's counting based on qualities
bool use_strict_mem; // use strict memory limit mode
int lowest_quality; // lowest quality value
@@ -180,6 +185,7 @@ struct CKMCQueues
CKmerQueue *kq;
CMemoryPool *pmm_bins, *pmm_fastq, *pmm_reads, *pmm_radix_buf, *pmm_prob, *pmm_stats, *pmm_expand;
CMemoryBins *memory_bins;
+ CMemoryPool* pmm_small_k_buf, *pmm_small_k_completer;
CDiskLogger* disk_logger;
diff --git a/kmer_counter/prob_qual.cpp b/kmer_counter/prob_qual.cpp
new file mode 100644
index 0000000..28eb2ba
--- /dev/null
+++ b/kmer_counter/prob_qual.cpp
@@ -0,0 +1,67 @@
+/*
+This file is a part of KMC software distributed under GNU GPL 3 licence.
+The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+Version: 2.3.0
+Date : 2015-08-21
+*/
+#include "stdafx.h"
+#include "prob_qual.h"
+
+// K-mers with probability less than MIN_PROB_QUAL_VALUE will not be counted (the splitter compares its k-mer probability product against it)
+double CProbQual::MIN_PROB_QUAL_VALUE = 0.0000; // initial threshold value
+double CProbQual::prob_qual[94] = { // looked up as prob_qual[quality symbol - lowest quality]; from index 2 on the values equal 1 - 10^(-q/10), indices 0 and 1 are fixed at 0.25
+ 0.2500000000000000, 0.2500000000000000, 0.3690426555198070, 0.4988127663727280,
+ 0.6018928294465030, 0.6837722339831620, 0.7488113568490420, 0.8004737685031120,
+ 0.8415106807538890, 0.8741074588205830, 0.9000000000000000, 0.9205671765275720,
+ 0.9369042655519810, 0.9498812766372730, 0.9601892829446500, 0.9683772233983160,
+ 0.9748811356849040, 0.9800473768503110, 0.9841510680753890, 0.9874107458820580,
+ 0.9900000000000000, 0.9920567176527570, 0.9936904265551980, 0.9949881276637270,
+ 0.9960189282944650, 0.9968377223398320, 0.9974881135684900, 0.9980047376850310,
+ 0.9984151068075390, 0.9987410745882060, 0.9990000000000000, 0.9992056717652760,
+ 0.9993690426555200, 0.9994988127663730, 0.9996018928294460, 0.9996837722339830,
+ 0.9997488113568490, 0.9998004737685030, 0.9998415106807540, 0.9998741074588210,
+ 0.9999000000000000, 0.9999205671765280, 0.9999369042655520, 0.9999498812766370,
+ 0.9999601892829450, 0.9999683772233980, 0.9999748811356850, 0.9999800473768500,
+ 0.9999841510680750, 0.9999874107458820, 0.9999900000000000, 0.9999920567176530,
+ 0.9999936904265550, 0.9999949881276640, 0.9999960189282940, 0.9999968377223400,
+ 0.9999974881135680, 0.9999980047376850, 0.9999984151068080, 0.9999987410745880,
+ 0.9999990000000000, 0.9999992056717650, 0.9999993690426560, 0.9999994988127660,
+ 0.9999996018928290, 0.9999996837722340, 0.9999997488113570, 0.9999998004737680,
+ 0.9999998415106810, 0.9999998741074590, 0.9999999000000000, 0.9999999205671770,
+ 0.9999999369042660, 0.9999999498812770, 0.9999999601892830, 0.9999999683772230,
+ 0.9999999748811360, 0.9999999800473770, 0.9999999841510680, 0.9999999874107460,
+ 0.9999999900000000, 0.9999999920567180, 0.9999999936904270, 0.9999999949881280,
+ 0.9999999960189280, 0.9999999968377220, 0.9999999974881140, 0.9999999980047380,
+ 0.9999999984151070, 0.9999999987410750, 0.9999999990000000, 0.9999999992056720,
+ 0.9999999993690430, 0.9999999994988130 };
+
+double CProbQual::inv_prob_qual[94] = { // element-wise reciprocals of prob_qual (e.g. 4.0 = 1 / 0.25); the splitter multiplies by these to remove the symbol leaving the k-mer window from its probability product
+ 4.0000000000000000, 4.0000000000000000, 2.7097138638119600, 2.0047602375372500,
+ 1.6614253419825500, 1.4624752955742600, 1.3354498310601800, 1.2492601748462100,
+ 1.1883390465158700, 1.1440241012807300, 1.1111111111111100, 1.0862868300084900,
+ 1.0673449110735400, 1.0527631448218000, 1.0414613220148200, 1.0326554320337200,
+ 1.0257660789563300, 1.0203588353185700, 1.0161041657513100, 1.0127497641386300,
+ 1.0101010101010100, 1.0080068832818700, 1.0063496369454600, 1.0050371177272600,
+ 1.0039969839853900, 1.0031723093832600, 1.0025182118938000, 1.0019992513458400,
+ 1.0015874090662800, 1.0012605123027600, 1.0010010010010000, 1.0007949596936500,
+ 1.0006313557030000, 1.0005014385482300, 1.0003982657229900, 1.0003163277976500,
+ 1.0002512517547400, 1.0001995660501600, 1.0001585144420900, 1.0001259083921100,
+ 1.0001000100010000, 1.0000794391335500, 1.0000630997157700, 1.0000501212353700,
+ 1.0000398123020100, 1.0000316237766300, 1.0000251194952900, 1.0000199530212600,
+ 1.0000158491831200, 1.0000125894126100, 1.0000100001000000, 1.0000079433454400,
+ 1.0000063096132600, 1.0000050118974600, 1.0000039810875500, 1.0000031622876600,
+ 1.0000025118927400, 1.0000019952663000, 1.0000015848957000, 1.0000012589270000,
+ 1.0000010000010000, 1.0000007943288700, 1.0000006309577400, 1.0000005011874800,
+ 1.0000003981073300, 1.0000003162278700, 1.0000002511887100, 1.0000001995262700,
+ 1.0000001584893400, 1.0000001258925600, 1.0000001000000100, 1.0000000794328300,
+ 1.0000000630957400, 1.0000000501187300, 1.0000000398107200, 1.0000000316227800,
+ 1.0000000251188600, 1.0000000199526200, 1.0000000158489300, 1.0000000125892500,
+ 1.0000000100000000, 1.0000000079432800, 1.0000000063095700, 1.0000000050118700,
+ 1.0000000039810700, 1.0000000031622800, 1.0000000025118900, 1.0000000019952600,
+ 1.0000000015848900, 1.0000000012589300, 1.0000000010000000, 1.0000000007943300,
+ 1.0000000006309600, 1.0000000005011900 };
+
+
diff --git a/kmer_counter/prob_qual.h b/kmer_counter/prob_qual.h
new file mode 100644
index 0000000..7280cf6
--- /dev/null
+++ b/kmer_counter/prob_qual.h
@@ -0,0 +1,20 @@
+/*
+This file is a part of KMC software distributed under GNU GPL 3 licence.
+The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+Version: 2.3.0
+Date : 2015-08-21
+*/
+
+#ifndef _PROB_QUAL_H
+#define _PROB_QUAL_H
+
+struct CProbQual // holder for the static quality-to-probability lookup tables (defined in prob_qual.cpp)
+{
+ static double prob_qual[94]; // probability per quality value; callers index it with (quality symbol - lowest quality)
+ static double inv_prob_qual[94]; // reciprocal of the corresponding prob_qual entry
+ static double MIN_PROB_QUAL_VALUE; // k-mers whose probability is below this value are not counted
+};
+#endif
\ No newline at end of file
diff --git a/kmer_counter/queues.h b/kmer_counter/queues.h
index b3b9611..e96013d 100644
--- a/kmer_counter/queues.h
+++ b/kmer_counter/queues.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _QUEUES_H
@@ -650,6 +650,14 @@ public:
part = (double*) (buffer + stack[--n_parts_free]*part_size);
}
+ // Allocate memory buffer - float*: waits until a part is free, then stores its address in 'part'
+ void reserve(float* &part)
+ {
+ unique_lock<mutex> lck(mtx);
+ cv.wait(lck, [this]{return n_parts_free > 0; }); // block until at least one free part is available
+
+ part = (float*)(buffer + stack[--n_parts_free] * part_size); // pop a free part index and turn it into an address inside buffer
+ }
// Deallocate memory buffer - uchar*
void free(uchar* part)
@@ -692,6 +700,14 @@ public:
stack[n_parts_free++] = (uint32) ((((uchar *) part) - buffer) / part_size);
cv.notify_all();
}
+ // Deallocate memory buffer - float*: puts the part back on the free stack and notifies waiters
+ void free(float* part)
+ {
+ lock_guard<mutex> lck(mtx);
+
+ stack[n_parts_free++] = (uint32)((((uchar *)part) - buffer) / part_size); // part index = byte offset from buffer divided by part_size
+ cv.notify_all(); // wake every thread waiting on cv for a free part
+ }
};
diff --git a/kmer_counter/radix.cpp b/kmer_counter/radix.cpp
index b2c07a2..8a2f5c3 100644
--- a/kmer_counter/radix.cpp
+++ b/kmer_counter/radix.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include <stdio.h>
diff --git a/kmer_counter/radix.h b/kmer_counter/radix.h
index 62efcad..fb945a2 100644
--- a/kmer_counter/radix.h
+++ b/kmer_counter/radix.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _RADIX_H
#define _RADIX_H
diff --git a/kmer_counter/rev_byte.cpp b/kmer_counter/rev_byte.cpp
index 66ec8c0..148cee0 100644
--- a/kmer_counter/rev_byte.cpp
+++ b/kmer_counter/rev_byte.cpp
@@ -5,8 +5,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#include "rev_byte.h"
diff --git a/kmer_counter/rev_byte.h b/kmer_counter/rev_byte.h
index ecf481f..c875d32 100644
--- a/kmer_counter/rev_byte.h
+++ b/kmer_counter/rev_byte.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _REV_BYTE_H
diff --git a/kmer_counter/s_mapper.h b/kmer_counter/s_mapper.h
index ca444c8..9eb2a53 100644
--- a/kmer_counter/s_mapper.h
+++ b/kmer_counter/s_mapper.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _S_MAPPER_H
diff --git a/kmer_counter/small_k_buf.h b/kmer_counter/small_k_buf.h
new file mode 100644
index 0000000..54cfc39
--- /dev/null
+++ b/kmer_counter/small_k_buf.h
@@ -0,0 +1,43 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+ Version: 2.3.0
+ Date : 2015-08-21
+*/
+
+#ifndef _SMALL_K_BUF
+#define _SMALL_K_BUF
+
+#include "defs.h"
+
+template<typename COUNTER_TYPE> // generic version for counter types that support >> and &; float is specialized separately
+struct CSmallKBuf
+{
+ COUNTER_TYPE* buf; // counter array; the small-k splitters index it by the k-mer value
+ void Store(uint64 index, uchar* _buf, uint32& buf_pos, uint64 counter_size) // appends the counter_size low bytes of buf[index] to _buf, least significant byte first, advancing buf_pos
+ {
+ for (uint64 j = 0; j < counter_size; ++j)
+ _buf[buf_pos++] = (buf[index] >> (j * 8)) & 0xFF; // NOTE(review): presumably counter_size <= sizeof(COUNTER_TYPE), otherwise the shift count exceeds the type width - confirm with callers
+ }
+};
+
+template<> // specialization for float counters, which cannot be bit-shifted directly
+struct CSmallKBuf<float>
+{
+ float* buf; // counter array of accumulated float values
+ void Store(uint64 index, uchar* _buf, uint32& buf_pos, uint64 counter_size)//counter_size should be always 4 here; the parameter is not read, exactly 4 bytes are written
+ {
+ uint32 c;
+ memcpy(&c, &buf[index], 4); // copy the 4 bytes of the float into an integer so they can be shifted out
+ for (int32 j = 0; j < 4; ++j)
+ _buf[buf_pos++] = (c >> (j * 8)) & 0xFF; // emit the bytes of c, least significant first, advancing buf_pos
+ }
+};
+
+
+
+
+#endif
\ No newline at end of file
diff --git a/kmer_counter/splitter.h b/kmer_counter/splitter.h
index e81582d..f673102 100644
--- a/kmer_counter/splitter.h
+++ b/kmer_counter/splitter.h
@@ -4,8 +4,8 @@
Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _SPLITTER_H
@@ -15,6 +15,7 @@
#include "kmer.h"
#include "kb_storer.h"
#include "kb_collector.h"
+#include "prob_qual.h"
#include "kb_reader.h"
#include "kb_sorter.h"
#include "kb_completer.h"
@@ -24,8 +25,9 @@
#include <stdio.h>
#include <iostream>
#include <vector>
-using namespace std;
+#include "small_k_buf.h"
+using namespace std;
//************************************************************************************************************
//************************************************************************************************************
@@ -36,7 +38,7 @@ template <bool QUAKE_MODE> class CSplitter_Impl;
//************************************************************************************************************
template <bool QUAKE_MODE> class CSplitter {
CMemoryMonitor *mm;
-
+ uint64 total_kmers = 0;
//CExKmer ex_kmer;
uchar *part;
uint64 part_size, part_pos;
@@ -64,8 +66,6 @@ template <bool QUAKE_MODE> class CSplitter {
inline bool GetSeq(char *seq, uint32 &seq_size);
inline bool GetSeq(char *seq, char *quals, uint32 &seq_size);
-
-
friend class CSplitter_Impl<QUAKE_MODE>;
public:
@@ -75,12 +75,19 @@ public:
CSplitter(CKMCParams &Params, CKMCQueues &Queues);
void InitBins(CKMCParams &Params, CKMCQueues &Queues);
- ~CSplitter();
-
+
bool ProcessReads(uchar *_part, uint64 _part_size);
+
+ template<typename COUNTER_TYPE>
+ bool ProcessReadsSmallK(uchar *_part, uint64 _part_size, CSmallKBuf<COUNTER_TYPE>& small_k_buf);
+
void Complete();
void GetTotal(uint64 &_n_reads);
+
+ uint64 GetTotalKmers();
+
+ ~CSplitter();
};
template <bool QUAKE_MODE> uint32 CSplitter<QUAKE_MODE>::MAX_LINE_SIZE = 1 << 14;
@@ -92,16 +99,21 @@ template <bool QUAKE_MODE> uint32 CSplitter<QUAKE_MODE>::MAX_LINE_SIZE = 1 << 14
template <bool QUAKE_MODE> class CSplitter_Impl {
public:
static bool ProcessReads(CSplitter<QUAKE_MODE> &ptr, uchar *_part, uint64 _part_size);
+ template<typename COUNTER_TYPE>
+ static bool ProcessReadsSmallK(CSplitter<QUAKE_MODE> &ptr, uchar *_part, uint64 _part_size, CSmallKBuf<COUNTER_TYPE>& small_k_buf);
};
template <> class CSplitter_Impl<false> {
public:
static bool ProcessReads(CSplitter<false> &ptr, uchar *_part, uint64 _part_size);
+ template<typename COUNTER_TYPE>
+ static bool ProcessReadsSmallK(CSplitter<false> &ptr, uchar *_part, uint64 _part_size, CSmallKBuf<COUNTER_TYPE>& small_k_buf);
};
template <> class CSplitter_Impl<true> {
public:
- static bool ProcessReads(CSplitter<true> &ptr, uchar *_part, uint64 _part_size);
+ static bool ProcessReads(CSplitter<true> &ptr, uchar *_part, uint64 _part_size);
+ static bool ProcessReadsSmallK(CSplitter<true> &ptr, uchar *_part, uint64 _part_size, CSmallKBuf<float>& small_k_buf);
};
//----------------------------------------------------------------------------------
@@ -520,6 +532,15 @@ template <bool QUAKE_MODE> void CSplitter<QUAKE_MODE>::Complete()
}
//----------------------------------------------------------------------------------
+// Process the reads from the given FASTQ file part in small k optimization mode (thin forwarder to CSplitter_Impl<QUAKE_MODE>; returns its result)
+template<bool QUAKE_MODE>
+template<typename COUNTER_TYPE>
+bool CSplitter<QUAKE_MODE>::ProcessReadsSmallK(uchar *_part, uint64 _part_size, CSmallKBuf<COUNTER_TYPE>& small_k_buf)
+{
+ return CSplitter_Impl<QUAKE_MODE>::ProcessReadsSmallK(*this, _part, _part_size, small_k_buf); // *this is passed so the static implementation can use this splitter's members
+}
+
+//----------------------------------------------------------------------------------
// Process the reads from the given FASTQ file part
template <bool QUAKE_MODE> bool CSplitter<QUAKE_MODE>::ProcessReads(uchar *_part, uint64 _part_size)
{
@@ -533,12 +554,144 @@ template <bool QUAKE_MODE> void CSplitter<QUAKE_MODE>::GetTotal(uint64 &_n_reads
_n_reads = n_reads;
}
+//----------------------------------------------------------------------------------
+// Return the number of kmers processed by splitter (!!! only for small k optimization)
+template <bool QUAKE_MODE> uint64 CSplitter<QUAKE_MODE>::GetTotalKmers()
+{
+ return total_kmers; // incremented once per counted k-mer by the ProcessReadsSmallK implementations
+}
//************************************************************************************************************
// Implementation of specific splitter methods for various types and sizes of kmers
//************************************************************************************************************
//----------------------------------------------------------------------------------
+// Process the reads from the given FASTQ file part in small k optimization mode (non-Quake variant: adds 1 to small_k_buf.buf[k-mer] for every k-mer without N; always returns true)
+template<typename COUNTER_TYPE>
+bool CSplitter_Impl<false>::ProcessReadsSmallK(CSplitter<false> &ptr, uchar *_part, uint64 _part_size, CSmallKBuf<COUNTER_TYPE>& small_k_buf)
+{
+ ptr.part = _part; // point the splitter at the new input part (presumably what GetSeq reads from)
+ ptr.part_size = _part_size;
+ ptr.part_pos = 0; // restart at the beginning of the part
+
+ char *seq;
+ uint32 seq_size;
+ int omit_next_n_kmers; // number of upcoming k-mers to skip because they would contain an N
+ CKmer<1> kmer_str, kmer_rev, kmer_can; // forward k-mer, its reverse-strand counterpart, and the smaller of the two
+ uint32 i;
+ CKmer<1> kmer_mask;
+ ptr.pmm_reads->reserve(seq); // borrow a read buffer from the pool; it is returned before the function exits
+ kmer_mask.set_n_1(2 * ptr.kmer_len); // presumably sets the 2*k low bits (2 bits per symbol) - confirm in CKmer
+
+ uint32 kmer_len_shift = (ptr.kmer_len - 1) * 2; // bit offset of the leading symbol in a complete k-mer
+
+ if (ptr.both_strands) // count each k-mer under the smaller of its forward and reverse forms
+ while (ptr.GetSeq(seq, seq_size))
+ {
+ if (ptr.file_type != multiline_fasta)
+ ptr.n_reads++; // sequences from multiline FASTA input are not counted as reads here
+
+ // Init k-mer
+ kmer_str.clear();
+ kmer_rev.clear();
+
+ // Process first k-1 symbols of a read
+ uint32 str_pos = kmer_len_shift - 2; // forward k-mer is filled from high bits downwards
+ uint32 rev_pos = 2; // reverse k-mer is filled from low bits upwards
+
+ omit_next_n_kmers = 0;
+
+ for (i = 0; i < ptr.kmer_len - 1; ++i, str_pos -= 2, rev_pos += 2) // NOTE(review): i is not checked against seq_size here; presumably GetSeq only yields reads of at least k symbols - confirm
+ {
+ if (seq[i] < 0) // negative code marks an N
+ {
+ seq[i] = 0; // substitute a valid symbol so the bit operations stay well defined
+ omit_next_n_kmers = i + 1; // the first i + 1 k-mers of the read would contain this N
+ }
+ kmer_str.set_2bits(seq[i], str_pos);
+ kmer_rev.set_2bits(3 - seq[i], rev_pos); // 3 - code is the symbol stored for the opposite strand
+ }
+
+ // Process next part of a read
+ for (; i < seq_size; ++i)
+ {
+ if (seq[i] < 0) // N in a read
+ {
+ seq[i] = 0;
+ omit_next_n_kmers = ptr.kmer_len; // Mark how many symbols to omit to get the next kmer without any N
+ }
+ kmer_str.SHL_insert_2bits(seq[i]); // append the new symbol to the forward k-mer
+ kmer_str.mask(kmer_mask); // drop the symbol that moved out of the k-mer
+ kmer_rev.SHR_insert_2bits(3 - seq[i], kmer_len_shift); // update the reverse k-mer with the opposite-strand symbol
+
+ // If necessary omit next symbols
+ if (omit_next_n_kmers > 0)
+ {
+ omit_next_n_kmers--;
+ continue;
+ }
+
+ // Find canonical kmer representation
+ kmer_can = (kmer_str < kmer_rev) ? kmer_str : kmer_rev;
+
+ ++small_k_buf.buf[kmer_can.data]; // the k-mer value itself is the index of its counter
+ ++ptr.total_kmers;
+ }
+ }
+ else // single-strand mode: only the forward k-mer is built and counted
+ while (ptr.GetSeq(seq, seq_size))
+ {
+ if (ptr.file_type != multiline_fasta)
+ ptr.n_reads++;
+
+ // Init k-mer
+ kmer_str.clear();
+
+ // Process first k-1 symbols of a read
+ uint32 str_pos = kmer_len_shift - 2;
+
+ omit_next_n_kmers = 0;
+
+ for (i = 0; i < ptr.kmer_len - 1; ++i, str_pos -= 2)
+ {
+ if (seq[i] < 0) // negative code marks an N
+ {
+ seq[i] = 0;
+ omit_next_n_kmers = i + 1; // the first i + 1 k-mers of the read would contain this N
+ }
+ kmer_str.set_2bits(seq[i], str_pos);
+ }
+
+ // Process next part of a read
+ for (; i < seq_size; ++i)
+ {
+ if (seq[i] < 0) // N in a read
+ {
+ seq[i] = 0;
+ omit_next_n_kmers = ptr.kmer_len; // Mark how many symbols to omit to get the next kmer without any N
+ }
+ kmer_str.SHL_insert_2bits(seq[i]);
+ kmer_str.mask(kmer_mask);
+
+ // If necessary omit next symbols
+ if (omit_next_n_kmers > 0)
+ {
+ omit_next_n_kmers--;
+ continue;
+ }
+
+ ++small_k_buf.buf[kmer_str.data];
+ ++ptr.total_kmers;
+ }
+ }
+ putchar('*'); // one progress mark per processed part
+ fflush(stdout);
+
+ ptr.pmm_reads->free(seq); // return the read buffer to the pool
+ return true;
+}
+
+//----------------------------------------------------------------------------------
// Process the reads from the given FASTQ file part
bool CSplitter_Impl<false>::ProcessReads(CSplitter<false> &ptr, uchar *_part, uint64 _part_size)
{
@@ -661,6 +814,164 @@ bool CSplitter_Impl<false>::ProcessReads(CSplitter<false> &ptr, uchar *_part, ui
return true;
}
+bool CSplitter_Impl<true>::ProcessReadsSmallK(CSplitter<true> &ptr, uchar *_part, uint64 _part_size, CSmallKBuf<float>& small_k_buf) // Quake variant of small k processing: adds each k-mer's probability (product of per-symbol prob_qual values) to small_k_buf.buf[k-mer]; always returns true
+{
+ ptr.part = _part; // point the splitter at the new input part (presumably what GetSeq reads from)
+ ptr.part_size = _part_size;
+ ptr.part_pos = 0;
+
+ char *seq;
+ char *quals;
+ double *raw_inv_probs;
+
+ ptr.pmm_reads->reserve(seq); // three pool buffers: symbols, qualities, inverse probabilities; all returned before exit
+ ptr.pmm_reads->reserve(quals);
+ ptr.pmm_reads->reserve(raw_inv_probs);
+
+ double *inv_probs = raw_inv_probs + 1; // shifted by one so that index -1 is a valid slot
+ inv_probs[-1] = 1.0; // !!! Correct: slot -1 is read when i == kmer_len - 1, where no symbol leaves the window yet
+
+ uint32 seq_size;
+ int omit_next_n_kmers; // number of upcoming k-mers to skip because they would contain an N
+ CKmer<1> kmer_str, kmer_rev, kmer_can; // forward k-mer, its reverse-strand counterpart, and the smaller of the two
+ double kmer_prob; // rolling product of prob_qual over the symbols currently in the window
+
+ uint32 i;
+ CKmer<1> kmer_mask;
+
+ kmer_mask.set_n_1(2 * ptr.kmer_len); // presumably sets the 2*k low bits (2 bits per symbol) - confirm in CKmer
+
+ uint32 kmer_len_shift = (ptr.kmer_len - 1) * 2; // bit offset of the leading symbol in a complete k-mer
+
+ if (ptr.both_strands) // count each k-mer under the smaller of its forward and reverse forms
+ while (ptr.GetSeq(seq, quals, seq_size))
+ {
+ ptr.n_reads++;
+
+ // Init k-mer
+ kmer_str.clear();
+ kmer_rev.clear();
+
+ // Process first k-1 symbols of a read
+ uint32 str_pos = kmer_len_shift - 2;
+ uint32 rev_pos = 2;
+
+ omit_next_n_kmers = 0;
+ kmer_prob = 1.0;
+
+ for (i = 0; i < ptr.kmer_len - 1; ++i, str_pos -= 2, rev_pos += 2) // NOTE(review): i is not checked against seq_size here; presumably GetSeq only yields reads of at least k symbols - confirm
+ {
+ if (seq[i] < 0) // negative code marks an N
+ {
+ seq[i] = 0;
+ omit_next_n_kmers = i + 1; // the first i + 1 k-mers of the read would contain this N
+
+ }
+ inv_probs[i] = CProbQual::inv_prob_qual[quals[i] - ptr.lowest_quality]; // remembered so this symbol can be divided out when it leaves the window; NOTE(review): the table index is not range-checked here
+
+ kmer_str.set_2bits(seq[i], str_pos);
+ kmer_rev.set_2bits(3 - seq[i], rev_pos); // 3 - code is the symbol stored for the opposite strand
+ kmer_prob *= CProbQual::prob_qual[quals[i] - ptr.lowest_quality];
+ }
+
+ // Process next part of a read
+ for (; i < seq_size; ++i)
+ {
+ if (seq[i] < 0) // N in a read
+ {
+ seq[i] = 0;
+ omit_next_n_kmers = ptr.kmer_len; // Mark how many symbols to omit to get the next kmer without any N
+ }
+ inv_probs[i] = CProbQual::inv_prob_qual[quals[i] - ptr.lowest_quality];
+
+ kmer_str.SHL_insert_2bits(seq[i]);
+ kmer_str.mask(kmer_mask);
+ kmer_rev.SHR_insert_2bits(3 - seq[i], kmer_len_shift);
+ kmer_prob *= CProbQual::prob_qual[quals[i] - ptr.lowest_quality] * inv_probs[(int)i - (int)ptr.kmer_len]; // multiply in the new symbol, divide out the one k positions back
+
+ // If necessary omit next symbols
+ if (omit_next_n_kmers > 0)
+ {
+ omit_next_n_kmers--;
+ continue;
+ }
+
+ if (kmer_prob < CProbQual::MIN_PROB_QUAL_VALUE) // too unlikely to be counted
+ continue;
+
+ // Find canonical kmer representation
+ kmer_can = (kmer_str < kmer_rev) ? kmer_str : kmer_rev;
+ small_k_buf.buf[kmer_can.data] += static_cast<float>(kmer_prob); // accumulate the probability instead of a unit count
+ ++ptr.total_kmers;
+ }
+ }
+ else // single-strand mode: only the forward k-mer is built and counted
+ while (ptr.GetSeq(seq, quals, seq_size))
+ {
+ ptr.n_reads++;
+
+ // Init k-mer
+ kmer_str.clear();
+
+ // Process first k-1 symbols of a read
+ uint32 str_pos = kmer_len_shift - 2;
+
+ omit_next_n_kmers = 0;
+ kmer_prob = 1.0;
+
+ for (i = 0; i < ptr.kmer_len - 1; ++i, str_pos -= 2)
+ {
+ if (seq[i] < 0) // negative code marks an N
+ {
+ seq[i] = 0;
+ omit_next_n_kmers = i + 1; // the first i + 1 k-mers of the read would contain this N
+
+ }
+ inv_probs[i] = CProbQual::inv_prob_qual[quals[i] - ptr.lowest_quality];
+
+ kmer_str.set_2bits(seq[i], str_pos);
+ kmer_prob *= CProbQual::prob_qual[quals[i] - ptr.lowest_quality];
+ }
+
+ // Process next part of a read
+ for (; i < seq_size; ++i)
+ {
+ if (seq[i] < 0) // N in a read
+ {
+ seq[i] = 0;
+ omit_next_n_kmers = ptr.kmer_len; // Mark how many symbols to omit to get the next kmer without any N
+ }
+ inv_probs[i] = CProbQual::inv_prob_qual[quals[i] - ptr.lowest_quality];
+
+ kmer_str.SHL_insert_2bits(seq[i]);
+ kmer_str.mask(kmer_mask);
+ kmer_prob *= CProbQual::prob_qual[quals[i] - ptr.lowest_quality] * inv_probs[(int)i - (int)ptr.kmer_len]; // multiply in the new symbol, divide out the one k positions back
+
+ // If necessary omit next symbols
+ if (omit_next_n_kmers > 0)
+ {
+ omit_next_n_kmers--;
+ continue;
+ }
+
+ if (kmer_prob < CProbQual::MIN_PROB_QUAL_VALUE) // too unlikely to be counted
+ continue;
+
+ small_k_buf.buf[kmer_str.data] += static_cast<float>(kmer_prob);
+ ++ptr.total_kmers;
+ }
+ }
+
+ putchar('*'); // one progress mark per processed part
+ fflush(stdout);
+
+ ptr.pmm_reads->free(seq); // return all three buffers to the pool
+ ptr.pmm_reads->free(quals);
+ ptr.pmm_reads->free(raw_inv_probs); // free the original pointer, not the shifted inv_probs
+
+ return true;
+}
+
//----------------------------------------------------------------------------------
// Process the reads from the given FASTQ file part
bool CSplitter_Impl<true>::ProcessReads(CSplitter<true> &ptr, uchar *_part, uint64 _part_size)
@@ -936,6 +1247,99 @@ template <bool QUAKE_MODE> void CWStatsSplitter<QUAKE_MODE>::GetStats(uint32* _s
}
+//************************************************************************************************************
+// CWSmallKSplitter class - wrapper for multithreading purposes (callable object that runs a CSplitter in small k mode over parts taken from the part queue)
+//************************************************************************************************************
+//----------------------------------------------------------------------------------
+template <bool QUAKE_MODE, typename COUNTER_TYPE> class CWSmallKSplitter {
+ CPartQueue *pq; // source of input parts
+ CMemoryPool *pmm_fastq, *pmm_small_k; // pool the input parts are returned to / pool providing the counter buffer
+ CSmallKBuf<COUNTER_TYPE> small_k_buf; // counter buffer filled by operator()
+
+ CSplitter<QUAKE_MODE> *spl; // created in the constructor, deleted and set to NULL at the end of operator()
+ uint64 n_reads; // totals cached from spl before it is deleted
+ uint64 total_kmers;
+ uint32 kmer_len;
+
+public:
+ CWSmallKSplitter(CKMCParams &Params, CKMCQueues &Queues);
+ ~CWSmallKSplitter();
+
+ void operator()();
+ void GetTotal(uint64 &_n_reads);
+
+ CSmallKBuf<COUNTER_TYPE> GetResult() // returns a copy of the buffer wrapper; the copy holds the same buf pointer
+ {
+ return small_k_buf;
+ }
+
+ uint64 GetTotalKmers() // live value while spl exists, cached value afterwards
+ {
+ if (spl)
+ return spl->GetTotalKmers();
+ return total_kmers;
+ }
+
+ void Release() // gives the counter buffer back to the pool it was reserved from in operator()
+ {
+ pmm_small_k->free(small_k_buf.buf);
+ }
+};
+
+//----------------------------------------------------------------------------------
+// Constructor - stores the queue/pool pointers and k, and creates the underlying splitter (n_reads and total_kmers are not initialized here)
+template <bool QUAKE_MODE, typename COUNTER_TYPE> CWSmallKSplitter<QUAKE_MODE, COUNTER_TYPE>::CWSmallKSplitter(CKMCParams &Params, CKMCQueues &Queues)
+{
+ pq = Queues.part_queue;
+ pmm_fastq = Queues.pmm_fastq;
+ pmm_small_k = Queues.pmm_small_k_buf; // pool from which operator() reserves the counter buffer
+ kmer_len = Params.kmer_len;
+ spl = new CSplitter<QUAKE_MODE>(Params, Queues); // deleted at the end of operator()
+}
+
+//----------------------------------------------------------------------------------
+// Destructor - intentionally empty; NOTE(review): spl is deleted only at the end of operator(), so it is not freed here if operator() never ran
+template <bool QUAKE_MODE, typename COUNTER_TYPE> CWSmallKSplitter<QUAKE_MODE, COUNTER_TYPE>::~CWSmallKSplitter()
+{
+}
+
+//----------------------------------------------------------------------------------
+// Execution - reserves and zeroes the counter buffer, processes parts until the queue is completed, then caches the totals and deletes the splitter
+template <bool QUAKE_MODE, typename COUNTER_TYPE> void CWSmallKSplitter<QUAKE_MODE, COUNTER_TYPE>::operator()()
+{
+ pmm_small_k->reserve(small_k_buf.buf);
+ memset(small_k_buf.buf, 0, (1ull << 2 * kmer_len) * sizeof(*small_k_buf.buf)); // clear all 4^k counters (1 << 2k entries)
+
+ // Splitting parts
+ while (!pq->completed())
+ {
+ uchar *part;
+ uint64 size;
+ if (pq->pop(part, size))
+ {
+ spl->ProcessReadsSmallK(part, size, small_k_buf);
+ pmm_fastq->free(part); // the part is returned to its pool once processed
+ }
+ }
+ spl->Complete();
+
+ spl->GetTotal(n_reads); // cache the totals so the getters still work after spl is gone
+ total_kmers = spl->GetTotalKmers();
+ delete spl;
+ spl = NULL; // the getters test this pointer to choose between live and cached values
+}
+
+//----------------------------------------------------------------------------------
+// Return statistics - writes the number of reads to _n_reads, refreshing it from the splitter while the splitter still exists
+template <bool QUAKE_MODE, typename COUNTER_TYPE> void CWSmallKSplitter<QUAKE_MODE, COUNTER_TYPE>::GetTotal(uint64 &_n_reads)
+{
+ if (spl)
+ spl->GetTotal(n_reads); // splitter still alive: take the current value
+
+ _n_reads = n_reads; // otherwise this is the value cached by operator()
+}
+
+
#endif
// ***** EOF
diff --git a/kmer_counter/timer.cpp b/kmer_counter/timer.cpp
index d9dfbff..5882519 100644
--- a/kmer_counter/timer.cpp
+++ b/kmer_counter/timer.cpp
@@ -6,8 +6,8 @@
The source codes are based on codes written by Dennis and published:
http://allmybrain.com/2008/06/10/timing-cc-code-on-linux/
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifdef WIN32
diff --git a/kmer_counter/timer.h b/kmer_counter/timer.h
index 784fb25..84b4fba 100644
--- a/kmer_counter/timer.h
+++ b/kmer_counter/timer.h
@@ -5,8 +5,8 @@
The source codes are based on codes written by Dennis and published:
http://allmybrain.com/2008/06/10/timing-cc-code-on-linux/
- Version: 2.2.0
- Date : 2015-04-15
+ Version: 2.3.0
+ Date : 2015-08-21
*/
#ifndef _TIMER_H
diff --git a/makefile b/makefile
index 8488f72..140e33c 100644
--- a/makefile
+++ b/makefile
@@ -4,11 +4,15 @@ KMC_BIN_DIR = bin
KMC_MAIN_DIR = kmer_counter
KMC_API_DIR = kmc_api
KMC_DUMP_DIR = kmc_dump
+KMC_TOOLS_DIR = kmc_tools
CC = g++
CFLAGS = -Wall -O3 -m64 -static -fopenmp -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++11
CLINK = -lm -static -fopenmp -O3 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++11
+KMC_TOOLS_CFLAGS = -Wall -O3 -m64 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14
+KMC_TOOLS_CLINK = -lm -static -O3 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++14
+
DISABLE_ASMLIB = false
KMC_OBJS = \
@@ -23,7 +27,8 @@ $(KMC_MAIN_DIR)/timer.o \
$(KMC_MAIN_DIR)/radix.o \
$(KMC_MAIN_DIR)/kb_completer.o \
$(KMC_MAIN_DIR)/kb_storer.o \
-$(KMC_MAIN_DIR)/kmer.o
+$(KMC_MAIN_DIR)/kmer.o \
+$(KMC_MAIN_DIR)/prob_qual.o
KMC_LIBS = \
$(KMC_MAIN_DIR)/libs/libz.a \
@@ -32,33 +37,62 @@ $(KMC_MAIN_DIR)/libs/libbz2.a
KMC_DUMP_OBJS = \
$(KMC_DUMP_DIR)/nc_utils.o \
$(KMC_API_DIR)/mmer.o \
-$(KMC_DUMP_DIR)/kmc_dump.o \
+$(KMC_DUMP_DIR)/kmc_dump.o
+
+KMC_API_OBJS = \
+$(KMC_API_DIR)/mmer.o \
$(KMC_API_DIR)/kmc_file.o \
$(KMC_API_DIR)/kmer_api.o
+KMC_TOOLS_OBJS = \
+$(KMC_TOOLS_DIR)/kmc_header.o \
+$(KMC_TOOLS_DIR)/kmc_tools.o \
+$(KMC_TOOLS_DIR)/nc_utils.o \
+$(KMC_TOOLS_DIR)/parameters_parser.o \
+$(KMC_TOOLS_DIR)/parser.o \
+$(KMC_TOOLS_DIR)/tokenizer.o \
+$(KMC_TOOLS_DIR)/fastq_filter.o \
+$(KMC_TOOLS_DIR)/fastq_reader.o \
+$(KMC_TOOLS_DIR)/fastq_writer.o \
+$(KMC_TOOLS_DIR)/percent_progress.o
+KMC_TOOLS_LIBS = \
+$(KMC_TOOLS_DIR)/libs/libz.a \
+$(KMC_TOOLS_DIR)/libs/libbz2.a
ifeq ($(DISABLE_ASMLIB),true)
CFLAGS += -DDISABLE_ASMLIB
+ KMC_TOOLS_CFLAGS += -DDISABLE_ASMLIB
else
KMC_LIBS += \
$(KMC_MAIN_DIR)/libs/alibelf64.a
+ KMC_TOOLS_LIBS += \
+ $(KMC_TOOLS_DIR)/libs/alibelf64.a
endif
-
-.cpp.o:
+$(KMC_OBJS) $(KMC_DUMP_OBJS) $(KMC_API_OBJS): %.o: %.cpp
$(CC) $(CFLAGS) -c $< -o $@
+$(KMC_TOOLS_OBJS): %.o: %.cpp
+ $(CC) $(KMC_TOOLS_CFLAGS) -c $< -o $@
+
+
kmc: $(KMC_OBJS)
-mkdir -p $(KMC_BIN_DIR)
$(CC) $(CLINK) -o $(KMC_BIN_DIR)/$@ $^ $(KMC_LIBS)
-kmc_dump: $(KMC_DUMP_OBJS)
+kmc_dump: $(KMC_DUMP_OBJS) $(KMC_API_OBJS)
-mkdir -p $(KMC_BIN_DIR)
$(CC) $(CLINK) -o $(KMC_BIN_DIR)/$@ $^
+
+kmc_tools: $(KMC_TOOLS_OBJS) $(KMC_API_OBJS)
+ -mkdir -p $(KMC_BIN_DIR)
+ $(CC) $(KMC_TOOLS_CLINK) -o $(KMC_BIN_DIR)/$@ $^ $(KMC_TOOLS_LIBS)
+
clean:
-rm $(KMC_MAIN_DIR)/*.o
-rm $(KMC_API_DIR)/*.o
-rm $(KMC_DUMP_DIR)/*.o
+ -rm $(KMC_TOOLS_DIR)/*.o
-rm -rf bin
-all: kmc kmc_dump
+all: kmc kmc_dump kmc_tools
\ No newline at end of file
diff --git a/makefile_mac b/makefile_mac
index 99f31b9..a523acf 100644
--- a/makefile_mac
+++ b/makefile_mac
@@ -1,13 +1,16 @@
-all: kmc
+all: kmc kmc_dump kmc_tools
KMC_BIN_DIR = bin
KMC_MAIN_DIR = kmer_counter
KMC_API_DIR = kmc_api
KMC_DUMP_DIR = kmc_dump
+KMC_TOOLS_DIR = kmc_tools
CC = /usr/local/Cellar/gcc49/4.9.2/bin/g++-4.9
CFLAGS = -Wall -O3 -m64 -static-libgcc -static-libstdc++ -fopenmp -pthread -std=c++11
CLINK = -lm -fopenmp -static-libgcc -static-libstdc++ -O3 -pthread -std=c++11
+KMC_TOOLS_CFLAGS = -Wall -O3 -m64 -static-libgcc -static-libstdc++ -pthread -std=c++14
+KMC_TOOLS_CLINK = -lm -static-libgcc -static-libstdc++ -O3 -pthread -std=c++14
DISABLE_ASMLIB = false
@@ -23,7 +26,8 @@ $(KMC_MAIN_DIR)/timer.o \
$(KMC_MAIN_DIR)/radix.o \
$(KMC_MAIN_DIR)/kb_completer.o \
$(KMC_MAIN_DIR)/kb_storer.o \
-$(KMC_MAIN_DIR)/kmer.o
+$(KMC_MAIN_DIR)/kmer.o \
+$(KMC_MAIN_DIR)/prob_qual.o
KMC_LIBS = \
$(KMC_MAIN_DIR)/libs/libz.1.2.5.dylib \
@@ -32,33 +36,62 @@ $(KMC_MAIN_DIR)/libs/libbz2.1.0.5.dylib
KMC_DUMP_OBJS = \
$(KMC_DUMP_DIR)/nc_utils.o \
$(KMC_API_DIR)/mmer.o \
-$(KMC_DUMP_DIR)/kmc_dump.o \
+$(KMC_DUMP_DIR)/kmc_dump.o
+
+KMC_API_OBJS = \
+$(KMC_API_DIR)/mmer.o \
$(KMC_API_DIR)/kmc_file.o \
$(KMC_API_DIR)/kmer_api.o
+KMC_TOOLS_OBJS = \
+$(KMC_TOOLS_DIR)/kmc_header.o \
+$(KMC_TOOLS_DIR)/kmc_tools.o \
+$(KMC_TOOLS_DIR)/nc_utils.o \
+$(KMC_TOOLS_DIR)/parameters_parser.o \
+$(KMC_TOOLS_DIR)/parser.o \
+$(KMC_TOOLS_DIR)/tokenizer.o \
+$(KMC_TOOLS_DIR)/fastq_filter.o \
+$(KMC_TOOLS_DIR)/fastq_reader.o \
+$(KMC_TOOLS_DIR)/fastq_writer.o \
+$(KMC_TOOLS_DIR)/percent_progress.o
+KMC_TOOLS_LIBS = \
+$(KMC_TOOLS_DIR)/libs/libz.1.2.5.dylib \
+$(KMC_TOOLS_DIR)/libs/libbz2.1.0.5.dylib
ifeq ($(DISABLE_ASMLIB),true)
CFLAGS += -DDISABLE_ASMLIB
+ KMC_TOOLS_CFLAGS += -DDISABLE_ASMLIB
else
KMC_LIBS += \
$(KMC_MAIN_DIR)/libs/libamac64.a
+ KMC_TOOLS_LIBS += \
+ $(KMC_TOOLS_DIR)/libs/libamac64.a
endif
-
-.cpp.o:
+$(KMC_OBJS) $(KMC_DUMP_OBJS) $(KMC_API_OBJS): %.o: %.cpp
$(CC) $(CFLAGS) -c $< -o $@
+$(KMC_TOOLS_OBJS): %.o: %.cpp
+ $(CC) $(KMC_TOOLS_CFLAGS) -c $< -o $@
+
+
+
kmc: $(KMC_OBJS)
-mkdir -p $(KMC_BIN_DIR)
$(CC) $(CLINK) -o $(KMC_BIN_DIR)/$@ $^ $(KMC_LIBS)
-kmc_dump: $(KMC_DUMP_OBJS)
+kmc_dump: $(KMC_DUMP_OBJS) $(KMC_API_OBJS)
-mkdir -p $(KMC_BIN_DIR)
$(CC) $(CLINK) -o $(KMC_BIN_DIR)/$@ $^
+kmc_tools: $(KMC_TOOLS_OBJS) $(KMC_API_OBJS)
+ -mkdir -p $(KMC_BIN_DIR)
+ $(CC) $(KMC_TOOLS_CLINK) -o $(KMC_BIN_DIR)/$@ $^ $(KMC_TOOLS_LIBS)
+
clean:
-rm $(KMC_MAIN_DIR)/*.o
-rm $(KMC_API_DIR)/*.o
-rm $(KMC_DUMP_DIR)/*.o
+ -rm $(KMC_TOOLS_DIR)/*.o
-rm -rf bin
-all: kmc kmc_dump
+all: kmc kmc_dump kmc_tools
\ No newline at end of file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/kmc.git
More information about the debian-med-commit
mailing list