[med-svn] [spades] 01/06: Imported Upstream version 3.9.0+dfsg
Sascha Steinbiss
satta at debian.org
Sun Jul 24 13:35:41 UTC 2016
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository spades.
commit 4fe3eea860fa553ab3649c489fa725d1d3ad4041
Author: Sascha Steinbiss <satta at debian.org>
Date: Sun Jul 24 09:47:21 2016 +0000
Imported Upstream version 3.9.0+dfsg
---
VERSION | 3 +-
changelog.html | 8 +
configs/debruijn/config.info | 12 +-
configs/debruijn/detail_info_printer.info | 2 +
configs/debruijn/log.properties | 2 +
configs/debruijn/pe_params.info | 12 +-
configs/debruijn/rna_mode.info | 50 +-
configs/debruijn/simplification.info | 13 +
configs/debruijn/simplification.info.template | 0
ext/include/cxxopts/cxxopts.hpp | 1368 ++++++++++++++++++++
ext/include/llvm/Support/MathExtras.h | 1 -
ext/src/bamtools/api/internal/io/pbgzf/bgzf.c | 4 +-
ext/src/bamtools/api/internal/io/pbgzf/bgzf.h | 4 +-
ext/src/llvm/CMakeLists.txt | 3 -
manual.html | 80 +-
metaspades.py | 57 +-
plasmidspades.py | 57 +-
metaspades.py => rnaspades.py | 57 +-
rnaspades_manual.html | 84 ++
spades.py | 57 +-
src/CMakeLists.txt | 9 +-
src/cmake/pack.cmake | 6 +-
.../algorithms/genome_consistance_checker.cpp | 2 +-
src/modules/algorithms/graph_construction.hpp | 9 +-
src/modules/algorithms/mismatch_shall_not_pass.hpp | 15 +-
.../algorithms/path_extend/extension_chooser.hpp | 60 +-
.../algorithms/path_extend/overlap_analysis.hpp | 14 +-
.../algorithms/path_extend/path_extend_launch.hpp | 862 +++++++-----
.../algorithms/path_extend/path_extender.hpp | 223 +++-
src/modules/algorithms/path_extend/path_filter.hpp | 50 +-
.../algorithms/path_extend/pe_config_struct.cpp | 67 +-
.../algorithms/path_extend/pe_config_struct.hpp | 365 +++---
src/modules/algorithms/path_extend/pe_io.hpp | 13 +-
src/modules/algorithms/path_extend/pe_resolver.hpp | 7 +-
.../scaffolder2015/extension_chooser2015.hpp | 22 +-
.../algorithms/path_extend/utils/CMakeLists.txt | 13 -
.../algorithms/path_extend/utils/find_aligns.py | 67 -
.../path_extend/utils/find_single_threshold.py | 101 --
.../path_extend/utils/paired_info_checker.cpp | 204 ---
.../path_extend/utils/run_all_parametrs.py | 47 -
.../erroneous_connection_remover.hpp | 123 ++
.../algorithms/simplification/tip_clipper.hpp | 12 +-
src/modules/assembly_graph/CMakeLists.txt | 2 +-
.../assembly_graph/graph_alignment/edge_index.hpp | 41 +-
.../graph_alignment/edge_index_refiller.cpp | 33 +
.../graph_alignment/edge_index_refiller.hpp | 20 +
.../assembly_graph/graph_alignment/kmer_mapper.hpp | 5 +
.../graph_alignment/pacbio/pac_index.hpp | 82 +-
.../graph_alignment/pacbio/pacbio_gap_closer.hpp | 20 +-
.../pacbio/pacbio_read_structures.hpp | 12 +-
.../graph_alignment/sequence_mapper.hpp | 69 +-
.../graph_alignment/sequence_mapper_notifier.hpp | 3 +
.../graph_alignment/short_read_mapper.hpp | 2 +-
.../graph_support/basic_edge_conditions.hpp | 1 +
.../assembly_graph/graph_support/contig_output.hpp | 22 +-
.../graph_support/genomic_quality.hpp | 6 +-
.../handlers/edges_position_handler.hpp | 6 +
src/modules/assembly_graph/paths/mapping_path.hpp | 5 +
src/modules/assembly_graph/stats/picture_dump.hpp | 21 +
.../debruijn_graph/debruijn_graph_constructor.hpp | 7 -
.../debruijn_graph/early_simplification.hpp | 81 +-
.../indices/edge_index_builders.hpp | 33 +-
.../data_structures/indices/edge_info_updater.hpp | 1 +
.../data_structures/indices/edge_multi_index.hpp | 8 +-
.../indices/edge_position_index.hpp | 29 +-
.../data_structures/indices/key_with_hash.hpp | 38 +-
.../indices/kmer_extension_index.hpp | 108 +-
.../indices/kmer_extension_index_builder.hpp | 106 ++
.../data_structures/indices/kmer_splitters.hpp | 249 +---
.../data_structures/indices/perfect_hash_map.hpp | 112 +-
.../indices/perfect_hash_map_builder.hpp | 102 ++
.../data_structures/mph_index/kmer_index.hpp | 403 +-----
.../{kmer_index.hpp => kmer_index_builder.hpp} | 394 ++----
.../mph_index/kmer_index_traits.hpp | 87 ++
src/modules/data_structures/mph_index/mphf.hpp | 2 +-
src/modules/data_structures/sequence/rtseq.hpp | 16 +-
src/modules/data_structures/sequence/sequence.hpp | 11 +
.../data_structures/sequence/simple_seq.hpp | 5 +-
src/modules/io/dataset_support/read_converter.hpp | 403 +++---
.../io/reads_io/modifying_reader_wrapper.hpp | 1 +
src/modules/io/reads_io/mpmc_bounded.hpp | 27 +-
src/modules/io/reads_io/paired_readers.hpp | 1 +
src/modules/io/reads_io/read_processor.hpp | 62 +-
src/modules/io/reads_io/splitting_wrapper.hpp | 1 +
src/modules/math/kmer_coverage_model.cpp | 4 +-
src/modules/paired_info/pair_info_improver.hpp | 6 +-
src/modules/pipeline/config_struct.cpp | 47 +-
src/modules/pipeline/config_struct.hpp | 46 +-
src/modules/pipeline/genomic_info_filler.cpp | 2 +-
src/modules/pipeline/graph_pack.hpp | 8 +-
src/modules/pipeline/library.hpp | 5 +-
src/modules/stages/construction.cpp | 9 +-
src/modules/stages/simplification.cpp | 87 +-
.../graph_simplification.hpp | 258 ++--
.../simplification_settings.hpp | 11 +-
src/modules/visualization/position_filler.hpp | 4 +-
src/modules/visualization/visualization_utils.hpp | 12 +-
src/projects/CMakeLists.txt | 11 +-
src/projects/cap/cap_kmer_index.hpp | 11 -
src/projects/cap/mosaic.hpp | 2 +-
src/projects/hammer/expander.cpp | 8 +-
src/projects/hammer/expander.hpp | 3 +-
src/projects/hammer/kmer_cluster.cpp | 2 +
src/projects/hammer/kmer_data.cpp | 495 +++----
src/projects/ionhammer/hamcluster.hpp | 1 -
src/projects/ionhammer/kmer_data.cpp | 121 +-
src/projects/ionhammer/read_corrector.hpp | 56 +-
src/projects/mph_test/CMakeLists.txt | 15 +
src/projects/mph_test/main.cpp | 184 +++
.../online_vis/debruijn_online_visualizer.hpp | 2 +-
.../drawing_commands/draw_contig_command.hpp | 12 +-
.../drawing_commands/draw_missasemblies.hpp | 25 +-
.../drawing_commands/draw_poorly_assembled.hpp | 135 +-
.../drawing_commands/drawing_command.hpp | 7 +-
.../position_commands/fill_position_command.hpp | 11 +-
src/projects/online_vis/processing_commands.hpp | 2 +-
src/projects/online_vis/vis_logger.hpp | 6 +-
.../scaffold_correction/scaffold_correction.hpp | 1 +
src/projects/spades/chromosome_removal.cpp | 2 +-
src/projects/spades/distance_estimation.cpp | 5 +-
src/projects/spades/gap_closer.cpp | 7 +-
src/projects/spades/main.cpp | 1 +
src/projects/spades/mismatch_correction.cpp | 8 +-
src/projects/spades/pacbio_aligning.cpp | 8 +-
src/projects/spades/pair_info_count.cpp | 12 +-
src/projects/spades/repeat_resolving.cpp | 35 +-
src/projects/truseq_analysis/analysis_pipeline.cpp | 1 +
src/spades_pipeline/options_storage.py | 84 +-
src/spades_pipeline/spades_logic.py | 54 +-
src/utils/adt/bf.hpp | 174 +++
src/utils/adt/hll.hpp | 69 +
131 files changed, 5295 insertions(+), 3576 deletions(-)
diff --git a/VERSION b/VERSION
index 0418bab..a5c4c76 100644
--- a/VERSION
+++ b/VERSION
@@ -1,2 +1 @@
-3.8.2
-
+3.9.0
diff --git a/changelog.html b/changelog.html
index e83ce50..a60139f 100644
--- a/changelog.html
+++ b/changelog.html
@@ -3,6 +3,14 @@
<h2>SPAdes Genome Assembler changelog</h2>
+<h3>SPAdes 3.9.0, 23 July 2016</h3>
+
+<p>NEW: rnaSPAdes pipeline for de novo transcriptome assembly from RNA-Seq data.</p>
+
+<p>CHANGE: Improved memory consumption in metagenomic pipeline.</p>
+
+<p>FIX: Several minor bugs.</p>
+
<h3>SPAdes 3.8.2, 10 July 2016</h3>
<p> FIX: Several minor bug-fixes for metaSPAdes and SPAdes pipelines.</p>
diff --git a/configs/debruijn/config.info b/configs/debruijn/config.info
index f747c29..1620f30 100644
--- a/configs/debruijn/config.info
+++ b/configs/debruijn/config.info
@@ -21,8 +21,10 @@ output_base ./data/debruijn/
tmp_dir spades_tmp/
main_iteration true
+; iterative mode switcher, activates additional contigs usage
+use_additional_contigs false
additional_contigs tmp_contigs.fasta
-load_from latest/saves/ ; tmp or latest
+load_from latest/saves/ ; tmp or latest
; Multithreading options
temp_bin_reads_dir .bin_reads/
@@ -43,10 +45,13 @@ scaffold_correction_mode false
; enabled (1) or disabled (0) repeat resolution (former "paired_mode")
rr_enable true
+;preserve raw paired index after distance estimation
+preserve_raw_paired_index false
+
; two-step pipeline
two_step_rr false
; enables/disables usage of intermediate contigs in two-step pipeline
-use_intermediate_contigs true
+use_intermediate_contigs false
;use single reads for rr (all | only_single_libs | none )
single_reads_rr only_single_libs
@@ -64,9 +69,6 @@ compute_paths_number false
; End of developer_mode parameters
-; iterative mode switcher, activates additional contigs usage
-use_additional_contigs false
-
; use unipaths as additional contigs instead of just graph edges
use_unipaths false
diff --git a/configs/debruijn/detail_info_printer.info b/configs/debruijn/detail_info_printer.info
index c055798..1336be8 100644
--- a/configs/debruijn/detail_info_printer.info
+++ b/configs/debruijn/detail_info_printer.info
@@ -4,7 +4,9 @@ info_printers
{
basic_stats false
lib_info false
+ save_all false
save_full_graph false
+ save_graph_pack false
extended_stats false
detailed_dot_write false
write_components false
diff --git a/configs/debruijn/log.properties b/configs/debruijn/log.properties
index a4052e3..cbe4c29 100644
--- a/configs/debruijn/log.properties
+++ b/configs/debruijn/log.properties
@@ -1,5 +1,7 @@
default=INFO
+#ConditionParser=DEBUG
+
#RelativeCoverageHelper=TRACE
#RelativelyLowCoveredComponentSearcher=TRACE
#RelativelyLowCoveredComponentChecker=TRACE
diff --git a/configs/debruijn/pe_params.info b/configs/debruijn/pe_params.info
index 405ea42..9c838bd 100644
--- a/configs/debruijn/pe_params.info
+++ b/configs/debruijn/pe_params.info
@@ -48,8 +48,9 @@ params {
}
scaffolder {
- on true
+ enabled true
cutoff 2
+ hard_cutoff 0
rel_cutoff 0.1
sum_threshold 3
@@ -70,6 +71,14 @@ params {
min_overlap_length 10
flank_addition_coefficient -5.9
flank_multiplication_coefficient 0.97
+
+ var_coeff 3.0
+ basic_overlap_coeff 2.0
+ }
+
+ path_cleaning
+ {
+ enabled false
}
loop_removal
@@ -91,6 +100,7 @@ params {
min_unique_length 10000
unique_coverage_variation 0.5
; (median * (1+variation) > unique > median * (1 - variation))
+ relative_weight_cutoff 2.0
}
scaffold_graph {
diff --git a/configs/debruijn/rna_mode.info b/configs/debruijn/rna_mode.info
index 727e22c..aad8fec 100644
--- a/configs/debruijn/rna_mode.info
+++ b/configs/debruijn/rna_mode.info
@@ -1,5 +1,7 @@
mode rna
+preserve_raw_paired_index true
+
simp
{
; enable advanced ec removal algo
@@ -10,6 +12,13 @@ simp
; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
condition "{ mmm 3 tc_lb 3.5, cb 100000, rctc 0.1 } { tc_lb 3.5, cb 4, rctc 10000 } { tc_lb 0.1, cb 20, rctc 10000 }"
}
+
+ dead_end
+ {
+ condition "{ tc_lb 3.5, cb 2 }"
+ enabled true
+ }
+
; bulge remover:
br
{
@@ -17,6 +26,7 @@ simp
max_coverage 1000000.0
max_relative_coverage 100000.0 ; bulge_cov < this * not_bulge_cov
}
+
; erroneous connections remover:
ec
{
@@ -24,8 +34,17 @@ simp
; icb: iterative coverage bound
; to_ec_lb: max_ec_length = 2*tip_length(to_ec_lb) - 1
; condition "{ ec_lb 9, icb 40.0 }"
- condition "{ ec_lb 30, icb auto }"
+ condition "{ ec_lb 30, icb 50 }"
}
+
+ ; relative coverage erroneous connections remover:
+ rcec
+ {
+ enabled true
+ rcec_lb 30
+ rcec_cb 0.5
+ }
+
rcc
{
enabled true
@@ -39,9 +58,9 @@ simp
;all topology based erroneous connection removers are off
ier
{
- enabled true
- max_length 200
- max_coverage 4
+ enabled false
+ max_length 100
+ max_coverage 2
max_length_any_cov 0 ; will be taken max with read_length
}
; hidden ec remover
@@ -69,7 +88,26 @@ simp
pe {
params {
- multi_path_extend true
- remove_overlaps false
+ ;multi_path_extend true
+ ;remove_overlaps false
+
+ scaffolder {
+ cutoff 1
+ hard_cutoff 10
+
+ cluster_info false
+
+ min_overlap_for_rna_scaffolding 10
+ }
+
+ path_cleaning
+ {
+ enabled true
+ min_length 30
+ isolated_min_length 50
+ min_length_for_low_covered 150
+ min_coverage 2
+ }
+
}
}
diff --git a/configs/debruijn/simplification.info b/configs/debruijn/simplification.info
index 6e05f34..4351abd 100644
--- a/configs/debruijn/simplification.info
+++ b/configs/debruijn/simplification.info
@@ -47,6 +47,14 @@ simp
condition "{ to_ec_lb 5, icb auto }"
; condition "{ ec_lb 9, icb 40.0 }"
}
+
+ ; relative coverage erroneous connections remover:
+ rcec
+ {
+ enabled false
+ rcec_lb 30
+ rcec_cb 0.5
+ }
; relative coverage erroneous component remover:
rcc
@@ -195,5 +203,10 @@ simp
; negative value to disable
disconnect_flank_cov -1.0
}
+
+ dead_end {
+ enabled false
+ condition ""
+ }
}
diff --git a/configs/debruijn/simplification.info.template b/configs/debruijn/simplification.info.template
new file mode 100644
index 0000000..e69de29
diff --git a/ext/include/cxxopts/cxxopts.hpp b/ext/include/cxxopts/cxxopts.hpp
new file mode 100644
index 0000000..4a00b4f
--- /dev/null
+++ b/ext/include/cxxopts/cxxopts.hpp
@@ -0,0 +1,1368 @@
+/*
+
+Copyright (c) 2014, 2015, 2016 Jarryd Beck
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+#ifndef CXX_OPTS_HPP
+#define CXX_OPTS_HPP
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#endif
+
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <vector>
+
+//when we ask cxxopts to use Unicode, help strings are processed using ICU,
+//which results in the correct lengths being computed for strings when they
+//are formatted for the help output
+//it is necessary to make sure that <unicode/unistr.h> can be found by the
+//compiler, and that icu-uc is linked in to the binary.
+
+#ifdef CXXOPTS_USE_UNICODE
+#include <unicode/unistr.h>
+
+namespace cxxopts
+{
+  // With CXXOPTS_USE_UNICODE defined, help text is held in ICU strings.
+  using String = icu::UnicodeString;
+
+  // Builds an ICU string from `s` through icu::UnicodeString::fromUTF8.
+  inline String toLocalString(std::string s)
+  {
+    return icu::UnicodeString::fromUTF8(s);
+  }
+
+  // Forward iterator over an icu::UnicodeString.
+  // Dereferencing returns char32At(position); ++ moves the position by one.
+  class UnicodeStringIterator
+    : public std::iterator<std::forward_iterator_tag, int32_t>
+  {
+    public:
+
+    UnicodeStringIterator(const icu::UnicodeString* text, int32_t pos)
+    : m_text(text)
+    , m_index(pos)
+    {
+    }
+
+    value_type operator*() const
+    {
+      return m_text->char32At(m_index);
+    }
+
+    // Equal only when both the string pointer and the position match.
+    bool operator==(const UnicodeStringIterator& rhs) const
+    {
+      return m_index == rhs.m_index && m_text == rhs.m_text;
+    }
+
+    bool operator!=(const UnicodeStringIterator& rhs) const
+    {
+      return m_index != rhs.m_index || m_text != rhs.m_text;
+    }
+
+    UnicodeStringIterator& operator++()
+    {
+      m_index += 1;
+      return *this;
+    }
+
+    // Returns a new iterator `v` positions further on; *this is unchanged.
+    UnicodeStringIterator operator+(int32_t v)
+    {
+      return UnicodeStringIterator(m_text, m_index + v);
+    }
+
+    private:
+    const icu::UnicodeString* m_text;
+    int32_t m_index;
+  };
+
+  // Appends `a` to `s` and returns `s`.
+  inline String& stringAppend(String&s, String a)
+  {
+    return s.append(std::move(a));
+  }
+
+  // Appends `n` copies of the code point `c` to `s` and returns `s`.
+  inline String& stringAppend(String& s, int n, UChar32 c)
+  {
+    for (int appended = 0; appended != n; ++appended)
+    {
+      s.append(c);
+    }
+
+    return s;
+  }
+
+  // Appends every element of [begin, end) to `s` and returns `s`.
+  template <typename Iterator>
+  String& stringAppend(String& s, Iterator begin, Iterator end)
+  {
+    for (; begin != end; ++begin)
+    {
+      s.append(*begin);
+    }
+
+    return s;
+  }
+
+  // Returns s.length() as a size_t.
+  inline size_t stringLength(const String& s)
+  {
+    return s.length();
+  }
+
+  // Converts `s` to a UTF-8 encoded std::string.
+  inline std::string toUTF8String(const String& s)
+  {
+    std::string utf8;
+    s.toUTF8String(utf8);
+
+    return utf8;
+  }
+}
+
+namespace std
+{
+  // begin/end overloads for icu::UnicodeString, returning
+  // cxxopts::UnicodeStringIterator.
+  //
+  // They are defined in a header, so they must be `inline`: otherwise every
+  // translation unit that includes this file emits its own external
+  // definition and linking two such units fails with duplicate symbols.
+
+  // Iterator positioned at index 0 of `s`.
+  inline
+  cxxopts::UnicodeStringIterator
+  begin(const icu::UnicodeString& s)
+  {
+    return cxxopts::UnicodeStringIterator(&s, 0);
+  }
+
+  // Iterator positioned at index s.length() of `s`.
+  inline
+  cxxopts::UnicodeStringIterator
+  end(const icu::UnicodeString& s)
+  {
+    return cxxopts::UnicodeStringIterator(&s, s.length());
+  }
+}
+
+//ifdef CXXOPTS_USE_UNICODE
+#else
+
+namespace cxxopts
+{
+  // Without CXXOPTS_USE_UNICODE, help text is a plain std::string.
+  using String = std::string;
+
+  // No conversion is needed in this configuration: hands the argument back.
+  template <typename T>
+  T toLocalString(T&& t)
+  {
+    return t;
+  }
+
+  // Number of chars in `s`.
+  inline size_t stringLength(const String& s)
+  {
+    return s.size();
+  }
+
+  // Appends `a` to `s` and returns `s`.
+  inline String& stringAppend(String&s, String a)
+  {
+    s.append(std::move(a));
+    return s;
+  }
+
+  // Appends `n` copies of `c` to `s` and returns `s`.
+  inline String& stringAppend(String& s, size_t n, char c)
+  {
+    s.append(n, c);
+    return s;
+  }
+
+  // Appends the range [begin, end) to `s` and returns `s`.
+  template <typename Iterator>
+  String& stringAppend(String& s, Iterator begin, Iterator end)
+  {
+    s.append(begin, end);
+    return s;
+  }
+
+  // No conversion is needed in this configuration: forwards the argument
+  // into the returned std::string.
+  template <typename T>
+  std::string toUTF8String(T&& t)
+  {
+    return std::forward<T>(t);
+  }
+
+}
+
+//ifdef CXXOPTS_USE_UNICODE
+#endif
+
+namespace cxxopts
+{
+ class Value : public std::enable_shared_from_this<Value> // abstract interface for an option's value: parsing plus default/implicit metadata
+ {
+ public:
+
+ virtual void
+ parse(const std::string& text) const = 0; // parse the argument text given for the option
+
+ virtual void
+ parse() const = 0; // parse without argument text; standard_value parses its default value here
+
+ virtual bool
+ has_arg() const = 0; // whether the option takes an argument
+
+ virtual bool
+ has_default() const = 0; // standard_value: true once default_value() has been called
+
+ virtual bool
+ is_container() const = 0; // standard_value: true when T is a std::vector
+
+ virtual bool
+ has_implicit() const = 0; // standard_value: true once implicit_value() has been called
+
+ virtual std::string
+ get_default_value() const = 0; // the text recorded by default_value()
+
+ virtual std::string
+ get_implicit_value() const = 0; // the text recorded by implicit_value()
+
+ virtual std::shared_ptr<Value>
+ default_value(const std::string& value) = 0; // record the default text; standard_value returns shared_from_this()
+
+ virtual std::shared_ptr<Value>
+ implicit_value(const std::string& value) = 0; // record the implicit text; standard_value returns shared_from_this()
+ };
+
+ class OptionException : public std::exception // root of all cxxopts exceptions; owns the message returned by what()
+ {
+ public:
+ OptionException(const std::string& message)
+ : m_message(message)
+ {
+ }
+
+ virtual const char*
+ what() const noexcept
+ {
+ return m_message.c_str(); // pointer stays valid only while this exception object is alive
+ }
+
+ private:
+ std::string m_message;
+ };
+
+ class OptionSpecException : public OptionException // base for errors in how an option was specified
+ {
+ public:
+
+ OptionSpecException(const std::string& message)
+ : OptionException(message)
+ {
+ }
+ };
+
+ class OptionParseException : public OptionException // base for errors raised while parsing arguments
+ {
+ public:
+ OptionParseException(const std::string& message)
+ : OptionException(message)
+ {
+ }
+ };
+
+ class option_exists_error : public OptionSpecException // message: the named option already exists
+ {
+ public:
+ option_exists_error(const std::string& option)
+ : OptionSpecException(u8"Option ‘" + option + u8"’ already exists")
+ {
+ }
+ };
+
+ class invalid_option_format_error : public OptionSpecException // thrown by OptionAdder::operator() when the specifier does not match option_specifier
+ {
+ public:
+ invalid_option_format_error(const std::string& format)
+ : OptionSpecException(u8"Invalid option format ‘" + format + u8"’")
+ {
+ }
+ };
+
+ class option_not_exists_exception : public OptionParseException // thrown when a parsed option name is not in the option map
+ {
+ public:
+ option_not_exists_exception(const std::string& option)
+ : OptionParseException(u8"Option ‘" + option + u8"’ does not exist")
+ {
+ }
+ };
+
+ class missing_argument_exception : public OptionParseException // thrown by checked_parse_arg when no argument follows and there is no implicit value
+ {
+ public:
+ missing_argument_exception(const std::string& option)
+ : OptionParseException(u8"Option ‘" + option + u8"’ is missing an argument")
+ {
+ }
+ };
+
+ class option_requires_argument_exception : public OptionParseException // thrown by parse() for an argument-taking short flag that is not last in its group and has no implicit value
+ {
+ public:
+ option_requires_argument_exception(const std::string& option)
+ : OptionParseException(u8"Option ‘" + option + u8"’ requires an argument")
+ {
+ }
+ };
+
+  // Thrown by Options::parse() when a long option that takes no argument is
+  // given one with "--name=value".
+  //
+  // `option` is the option name and `arg` is the text after '='; both are
+  // quoted in the message.
+  class option_not_has_argument_exception : public OptionParseException
+  {
+    public:
+    option_not_has_argument_exception
+    (
+      const std::string& option,
+      const std::string& arg
+    )
+    : OptionParseException(
+        // A space separates "argument" from the opening quote, and every
+        // literal carries the u8 prefix like the sibling exception messages.
+        u8"Option ‘" + option + u8"’ does not take an argument, but argument ‘"
+        + arg + u8"’ given")
+    {
+    }
+  };
+
+ class option_not_present_exception : public OptionParseException // thrown by Options::operator[] when the requested name is not in the option map
+ {
+ public:
+ option_not_present_exception(const std::string& option)
+ : OptionParseException(u8"Option ‘" + option + u8"’ not present")
+ {
+ }
+ };
+
+ class argument_incorrect_type : public OptionParseException // thrown by values::parse_value when the argument text cannot be converted
+ {
+ public:
+ argument_incorrect_type
+ (
+ const std::string& arg
+ )
+ : OptionParseException(
+ u8"Argument ‘" + arg + u8"’ failed to parse"
+ )
+ {
+ }
+ };
+
+ namespace values
+ {
+    // Generic conversion: extracts a T from `text` with operator>>.
+    // Throws argument_incorrect_type when extraction fails or when the
+    // stream buffer still reports available characters afterwards.
+    template <typename T>
+    void
+    parse_value(const std::string& text, T& value)
+    {
+      std::istringstream stream(text);
+
+      const bool extracted = static_cast<bool>(stream >> value);
+      if (!extracted || stream.rdbuf()->in_avail() != 0)
+      {
+        throw argument_incorrect_type(text);
+      }
+    }
+
+    // Container conversion: parses `text` as a single T and appends it, so
+    // each call adds one element to `value`.
+    template <typename T>
+    void
+    parse_value(const std::string& text, std::vector<T>& value)
+    {
+      T element;
+      parse_value(text, element);
+      value.push_back(element);
+    }
+
+ inline
+ void
+ parse_value(const std::string& /*text*/, bool& value) // bool overload: the text is ignored, the flag is simply switched on
+ {
+ //TODO recognise on, off, yes, no, enable, disable
+ //so that we can write --long=yes explicitly
+ value = true;
+ }
+
+ inline
+ void
+ parse_value(const std::string& text, std::string& value) // string overload: the text is copied verbatim, no stream extraction
+ {
+ value = text;
+ }
+
+ template <typename T>
+ struct value_has_arg // trait read by standard_value::has_arg(): by default a value takes an argument
+ {
+ static constexpr bool value = true;
+ };
+
+ template <>
+ struct value_has_arg<bool> // bool values take no argument
+ {
+ static constexpr bool value = false;
+ };
+
+ template <typename T>
+ struct type_is_container // trait read by standard_value::is_container(): false unless specialised
+ {
+ static constexpr bool value = false;
+ };
+
+ template <typename T>
+ struct type_is_container<std::vector<T>> // std::vector<T> counts as a container
+ {
+ static constexpr bool value = true;
+ };
+
+    // Value implementation that converts option text into a T using the
+    // parse_value overloads.
+    //
+    // The parsed result is written either to a T owned by this object
+    // (default constructor) or to a T supplied by the caller (pointer
+    // constructor).
+    template <typename T>
+    class standard_value : public Value
+    {
+      public:
+      // Allocates the result internally and parses into it.
+      standard_value()
+      : m_result(std::make_shared<T>())
+      , m_store(m_result.get())
+      {
+      }
+
+      // Parses into `*t`; no internal result is allocated.
+      standard_value(T* t)
+      : m_store(t)
+      {
+      }
+
+      // Parses `text`, or the implicit value when one has been set and
+      // `text` is empty.
+      void
+      parse(const std::string& text) const
+      {
+        const bool use_implicit = m_implicit && text.empty();
+        parse_value(use_implicit ? m_implicit_value : text, *m_store);
+      }
+
+      bool
+      is_container() const
+      {
+        return type_is_container<T>::value;
+      }
+
+      // Parses the recorded default value.
+      void
+      parse() const
+      {
+        parse_value(m_default_value, *m_store);
+      }
+
+      bool
+      has_arg() const
+      {
+        return value_has_arg<T>::value;
+      }
+
+      bool
+      has_default() const
+      {
+        return m_default;
+      }
+
+      bool
+      has_implicit() const
+      {
+        return m_implicit;
+      }
+
+      // Records `value` as the default text and returns this object.
+      virtual std::shared_ptr<Value>
+      default_value(const std::string& value)
+      {
+        m_default = true;
+        m_default_value = value;
+        return shared_from_this();
+      }
+
+      // Records `value` as the implicit text and returns this object.
+      virtual std::shared_ptr<Value>
+      implicit_value(const std::string& value)
+      {
+        m_implicit = true;
+        m_implicit_value = value;
+        return shared_from_this();
+      }
+
+      std::string
+      get_default_value() const
+      {
+        return m_default_value;
+      }
+
+      std::string
+      get_implicit_value() const
+      {
+        return m_implicit_value;
+      }
+
+      // Returns the parsed value: the external store when there is one,
+      // otherwise the internally owned result.
+      const T&
+      get() const
+      {
+        return (m_store == nullptr) ? *m_result : *m_store;
+      }
+
+      protected:
+      std::shared_ptr<T> m_result;
+      T* m_store;
+      bool m_default = false;
+      std::string m_default_value;
+      bool m_implicit = false;
+      std::string m_implicit_value;
+    };
+ }
+
+ template <typename T>
+ std::shared_ptr<Value>
+ value() // creates a standard_value<T> that owns its parsed result
+ {
+ return std::make_shared<values::standard_value<T>>();
+ }
+
+ template <typename T>
+ std::shared_ptr<Value>
+ value(T& t) // creates a standard_value<T> that parses into `t`; the address of `t` is stored, so `t` must outlive parsing
+ {
+ return std::make_shared<values::standard_value<T>>(&t);
+ }
+
+ class OptionAdder;
+
+  // Pairs an option's help description with its Value and counts how many
+  // times the option has been parsed from the command line.
+  class OptionDetails
+  {
+    public:
+    OptionDetails
+    (
+      const String& description,
+      std::shared_ptr<const Value> value
+    )
+    : m_desc(description)
+    , m_value(value)
+    , m_count(0)
+    {
+    }
+
+    const String& description() const { return m_desc; }
+
+    bool has_arg() const { return m_value->has_arg(); }
+
+    // Parses `text` through the value, then bumps the occurrence count.
+    // The count is left untouched when parsing throws.
+    void parse(const std::string& text)
+    {
+      m_value->parse(text);
+      m_count += 1;
+    }
+
+    // Parses through the value's argument-less parse(); does not count as
+    // an occurrence.
+    void parse_default() { m_value->parse(); }
+
+    int count() const { return m_count; }
+
+    const Value& value() const { return *m_value; }
+
+    // Returns the parsed value as a T. With CXXOPTS_NO_RTTI the cast is
+    // unchecked; otherwise a wrong T makes dynamic_cast throw.
+    template <typename T>
+    const T& as() const
+    {
+#ifdef CXXOPTS_NO_RTTI
+      return static_cast<const values::standard_value<T>&>(*m_value).get();
+#else
+      return dynamic_cast<const values::standard_value<T>&>(*m_value).get();
+#endif
+    }
+
+    private:
+    String m_desc;
+    std::shared_ptr<const Value> m_value;
+    int m_count;
+  };
+
+ struct HelpOptionDetails // per-option data read by format_option / format_description when building help text
+ {
+ std::string s; // short name, printed as "-s,"
+ std::string l; // long name, printed as "--l"
+ String desc; // description text
+ bool has_arg;
+ bool has_default;
+ std::string default_value; // appended to the description when has_default is set
+ bool has_implicit;
+ std::string implicit_value; // shown in the argument placeholder when has_implicit is set
+ std::string arg_help; // argument placeholder; "arg" is printed when this is empty
+ };
+
+ struct HelpGroupDetails // help data for one option group (Options keeps a map from group name to this)
+ {
+ std::string name;
+ std::string description;
+ std::vector<HelpOptionDetails> options;
+ };
+
+  // Registry of command-line options together with the parser that fills
+  // them in.
+  //
+  // Options are registered through add_options() / add_option(), parse()
+  // walks argv, and results are read back with count() and operator[].
+  class Options
+  {
+    public:
+
+    // Stores the program name and the help string (converted with
+    // toLocalString).
+    //
+    // m_next_positional starts at m_positional.end(): consume_positional()
+    // compares it against m_positional.end() even when parse_positional()
+    // was never called, and comparing an uninitialised (singular) iterator
+    // is undefined behaviour.
+    // NOTE(review): the iterator refers to this object's m_positional, so a
+    // copied Options should call parse_positional() again before parse().
+    Options(std::string program, std::string help_string = "")
+    : m_program(std::move(program))
+    , m_help_string(toLocalString(std::move(help_string)))
+    , m_next_positional(m_positional.end())
+    {
+    }
+
+    // Parses argv[1..argc). Arguments that are neither options nor consumed
+    // as positionals are moved to the front of argv (after argv[0]) and argc
+    // is set to the number of entries kept.
+    inline
+    void
+    parse(int& argc, char**& argv);
+
+    // Returns a helper whose operator() registers options in `group`.
+    inline
+    OptionAdder
+    add_options(std::string group = "");
+
+    // Registers one option with short name `s` and long name `l` in `group`.
+    inline
+    void
+    add_option
+    (
+      const std::string& group,
+      const std::string& s,
+      const std::string& l,
+      std::string desc,
+      std::shared_ptr<const Value> value,
+      std::string arg_help
+    );
+
+    // Number of times option `o` was parsed; 0 when `o` is unknown.
+    int
+    count(const std::string& o) const
+    {
+      auto iter = m_options.find(o);
+      if (iter == m_options.end())
+      {
+        return 0;
+      }
+
+      return iter->second->count();
+    }
+
+    // Details of `option`; throws option_not_present_exception when the
+    // name is unknown.
+    const OptionDetails&
+    operator[](const std::string& option) const
+    {
+      auto iter = m_options.find(option);
+
+      if (iter == m_options.end())
+      {
+        throw option_not_present_exception(option);
+      }
+
+      return *iter->second;
+    }
+
+    //parse positional arguments into the given option
+    inline
+    void
+    parse_positional(std::string option);
+
+    // Same as above for an ordered list of option names.
+    inline
+    void
+    parse_positional(std::vector<std::string> options);
+
+    inline
+    std::string
+    help(const std::vector<std::string>& groups = {""}) const;
+
+    inline
+    const std::vector<std::string>
+    groups() const;
+
+    inline
+    const HelpGroupDetails&
+    group_help(const std::string& group) const;
+
+    private:
+
+    inline
+    void
+    add_one_option
+    (
+      const std::string& option,
+      std::shared_ptr<OptionDetails> details
+    );
+
+    // Feeds `a` to the next positional option; returns false when no
+    // positional option is left.
+    inline
+    bool
+    consume_positional(std::string a);
+
+    // Parses `arg` into the option named `option`; throws
+    // option_not_exists_exception when the name is unknown.
+    inline
+    void
+    add_to_option(const std::string& option, const std::string& arg);
+
+    inline
+    void
+    parse_option
+    (
+      std::shared_ptr<OptionDetails> value,
+      const std::string& name,
+      const std::string& arg = ""
+    );
+
+    // Parses the argument following argv[current] into `value`, advancing
+    // `current` when that argument is consumed.
+    inline
+    void
+    checked_parse_arg
+    (
+      int argc,
+      char* argv[],
+      int& current,
+      std::shared_ptr<OptionDetails> value,
+      const std::string& name
+    );
+
+    inline
+    String
+    help_one_group(const std::string& group) const;
+
+    std::string m_program;
+    String m_help_string;
+
+    std::map<std::string, std::shared_ptr<OptionDetails>> m_options;
+    std::vector<std::string> m_positional;
+    // Next entry of m_positional to receive a positional argument.
+    // Declared after m_positional so the constructor can initialise it from
+    // m_positional.end().
+    std::vector<std::string>::iterator m_next_positional;
+
+    //mapping from groups to help options
+    std::map<std::string, HelpGroupDetails> m_help;
+  };
+
+ class OptionAdder // helper returned by Options::add_options(); each operator() call registers one option in the bound group
+ {
+ public:
+
+ OptionAdder(Options& options, std::string group)
+ : m_options(options), m_group(std::move(group))
+ {
+ }
+
+ inline
+ OptionAdder&
+ operator() // returns *this so that calls can be chained
+ (
+ const std::string& opts, // specifier matched against option_specifier: optional "<letter>," followed by the long name
+ const std::string& desc,
+ std::shared_ptr<const Value> value
+ = ::cxxopts::value<bool>(), // default is a bool value, for which value_has_arg is false
+ std::string arg_help = ""
+ );
+
+ private:
+ Options& m_options; // held by reference
+ std::string m_group;
+ };
+
+}
+
+namespace cxxopts
+{
+
+ namespace
+ {
+
+ constexpr int OPTION_LONGEST = 30; // NOTE(review): presumably a width limit for the option column of the help output; its use is not visible here
+ constexpr int OPTION_DESC_GAP = 2; // NOTE(review): presumably the gap between option column and description; confirm at the use site
+
+ std::basic_regex<char> option_matcher
+ ("--([[:alnum:]][-_[:alnum:]]+)(=(.*))?|-([a-zA-Z]+)"); // group 1: long name, group 3: text after '=', group 4: run of short flags
+
+ std::basic_regex<char> option_specifier
+ ("(([a-zA-Z]),)?([a-zA-Z0-9][-_a-zA-Z0-9]+)"); // group 2: one-letter short name, group 3: long name
+
+    // Builds the option column of one help line from `o`: the short name as
+    // "-s,", the long name as "--long", then the argument placeholder when
+    // the option takes an argument.
+    String
+    format_option
+    (
+      const HelpOptionDetails& o
+    )
+    {
+      auto& s = o.s;
+      auto& l = o.l;
+
+      String result = " ";
+
+      if (s.size() > 0)
+      {
+        result += "-" + toLocalString(s) + ",";
+      }
+      else
+      {
+        // Pad by the width of "-x," (option_specifier only admits a
+        // one-letter short name) so that long names start in the same
+        // column whether or not the option has a short name.
+        result += "   ";
+      }
+
+      if (l.size() > 0)
+      {
+        result += " --" + toLocalString(l);
+      }
+
+      if (o.has_arg)
+      {
+        // Fall back to the generic placeholder "arg" when no help text was
+        // given for the argument.
+        auto arg = o.arg_help.size() > 0 ? toLocalString(o.arg_help) : "arg";
+
+        if (o.has_implicit)
+        {
+          result += " [=" + arg + "(=" + toLocalString(o.implicit_value) + ")]";
+        }
+        else
+        {
+          result += " " + arg;
+        }
+      }
+
+      return result;
+    }
+
+ String
+ format_description // builds the description column for `o`, breaking it into lines and indenting continuation lines by `start` spaces
+ (
+ const HelpOptionDetails& o,
+ size_t start, // number of spaces emitted after each inserted newline
+ size_t width, // a break is inserted once the running count exceeds this
+ )
+ {
+ auto desc = o.desc;
+
+ if (o.has_default)
+ {
+ desc += toLocalString(" (default:" + o.default_value + ")");
+ }
+
+ String result;
+
+ auto current = std::begin(desc);
+ auto startLine = current; // first character not yet copied into result
+ auto lastSpace = current; // most recent space seen; equal to startLine when the current line has none
+
+ auto size = size_t{}; // characters counted since the last break
+
+ while (current != std::end(desc))
+ {
+ if (*current == ' ')
+ {
+ lastSpace = current;
+ }
+
+ if (size > width)
+ {
+ if (lastSpace == startLine)
+ {
+ stringAppend(result, startLine, current + 1); // no space to break at: cut after the current character
+ stringAppend(result, "\n");
+ stringAppend(result, start, ' ');
+ startLine = current + 1;
+ lastSpace = startLine;
+ }
+ else
+ {
+ stringAppend(result, startLine, lastSpace); // break at the last space, which is dropped
+ stringAppend(result, "\n");
+ stringAppend(result, start, ' ');
+ startLine = lastSpace + 1;
+ }
+ size = 0;
+ }
+ else
+ {
+ ++size;
+ }
+
+ ++current;
+ }
+
+ //append whatever is left
+ stringAppend(result, startLine, current);
+
+ return result;
+ }
+ }
+
+OptionAdder
+Options::add_options(std::string group) // returns an OptionAdder bound to this Options object and to `group`
+{
+ return OptionAdder(*this, std::move(group));
+}
+
+// Registers one option in this adder's group.
+//
+// `opts` must match option_specifier (optional "<letter>," followed by the
+// long name); otherwise invalid_option_format_error is thrown. Returns *this
+// so that calls can be chained.
+OptionAdder&
+OptionAdder::operator()
+(
+  const std::string& opts,
+  const std::string& desc,
+  std::shared_ptr<const Value> value,
+  std::string arg_help
+)
+{
+  std::match_results<const char*> parsed;
+
+  if (!std::regex_match(opts.c_str(), parsed, option_specifier))
+  {
+    throw invalid_option_format_error(opts);
+  }
+
+  // Group 2 is the short name (empty when absent), group 3 the long name.
+  const std::string short_name = parsed[2].str();
+  const std::string long_name = parsed[3].str();
+
+  m_options.add_option(m_group, short_name, long_name, desc, value,
+    std::move(arg_help));
+
+  return *this;
+}
+
+void
+Options::parse_option // forwards `arg` to the option's parse(); the option name is currently unused
+(
+ std::shared_ptr<OptionDetails> value,
+ const std::string& /*name*/,
+ const std::string& arg
+)
+{
+ value->parse(arg);
+}
+
+// Supplies an argument to an option that takes one.
+//
+// The argument is argv[current + 1] when it exists, in which case `current`
+// is advanced past it. The implicit value (empty text) is used instead when
+// the option has one and either nothing follows or the next argument starts
+// with '-'. Throws missing_argument_exception when nothing follows and there
+// is no implicit value.
+void
+Options::checked_parse_arg
+(
+  int argc,
+  char* argv[],
+  int& current,
+  std::shared_ptr<OptionDetails> value,
+  const std::string& name
+)
+{
+  const bool next_exists = current + 1 < argc;
+
+  if (!next_exists)
+  {
+    if (!value->value().has_implicit())
+    {
+      throw missing_argument_exception(name);
+    }
+
+    parse_option(value, name, "");
+    return;
+  }
+
+  const char* next = argv[current + 1];
+
+  if (next[0] == '-' && value->value().has_implicit())
+  {
+    parse_option(value, name, "");
+    return;
+  }
+
+  parse_option(value, name, next);
+  ++current;
+}
+
+// Parses `arg` into the option registered under `option`.
+// Throws option_not_exists_exception when no such option is registered.
+void
+Options::add_to_option(const std::string& option, const std::string& arg)
+{
+  const auto found = m_options.find(option);
+
+  if (found != m_options.end())
+  {
+    parse_option(found->second, option, arg);
+    return;
+  }
+
+  throw option_not_exists_exception(option);
+}
+
+// Feeds `a` to the current positional option.
+//
+// Returns false when every positional option has already been used. After a
+// successful parse the position advances to the next option unless the
+// current one holds a container, which keeps receiving further arguments.
+bool
+Options::consume_positional(std::string a)
+{
+  if (m_next_positional == m_positional.end())
+  {
+    return false;
+  }
+
+  add_to_option(*m_next_positional, a);
+
+  const auto target = m_options.find(*m_next_positional);
+  const bool known = target != m_options.end();
+  if (known && !target->second->value().is_container())
+  {
+    ++m_next_positional;
+  }
+
+  return true;
+}
+
+// Convenience overload: routes positional arguments to the single option
+// named `option`.
+void
+Options::parse_positional(std::string option)
+{
+  std::vector<std::string> single_option{option};
+  parse_positional(std::move(single_option));
+}
+
+void
+Options::parse_positional(std::vector<std::string> options) // sets the ordered option names that receive positional arguments
+{
+ m_positional = std::move(options);
+ m_next_positional = m_positional.begin(); // restart consumption at the first name
+}
+
+void
+Options::parse(int& argc, char**& argv) // parses argv[1..argc); unconsumed non-option arguments are compacted to the front of argv and argc is updated
+{
+ int current = 1; // index of the argument being examined; argv[0] is skipped
+
+ int nextKeep = 1; // next slot in argv for an argument that is kept
+
+ bool consume_remaining = false;
+
+ while (current != argc)
+ {
+ if (strcmp(argv[current], "--") == 0) // "--" ends option parsing; the rest is handled after the loop
+ {
+ consume_remaining = true;
+ ++current;
+ break;
+ }
+
+ std::match_results<const char*> result;
+ std::regex_match(argv[current], result, option_matcher);
+
+ if (result.empty())
+ {
+ //not a flag
+
+ //if true is returned here then it was consumed, otherwise it is
+ //ignored
+ if (consume_positional(argv[current]))
+ {
+ }
+ else
+ {
+ argv[nextKeep] = argv[current];
+ ++nextKeep;
+ }
+ //if we return from here then it was parsed successfully, so continue
+ }
+ else
+ {
+ //short or long option?
+ if (result[4].length() != 0) // group 4: one or more short flags after a single '-'
+ {
+ const std::string& s = result[4];
+
+ for (std::size_t i = 0; i != s.size(); ++i)
+ {
+ std::string name(1, s[i]); // each letter is looked up as its own option
+ auto iter = m_options.find(name);
+
+ if (iter == m_options.end())
+ {
+ throw option_not_exists_exception(name);
+ }
+
+ auto value = iter->second;
+
+ //if no argument then just add it
+ if (!value->has_arg())
+ {
+ parse_option(value, name);
+ }
+ else
+ {
+ //it must be the last argument
+ if (i + 1 == s.size())
+ {
+ checked_parse_arg(argc, argv, current, value, name);
+ }
+ else if (value->value().has_implicit())
+ {
+ parse_option(value, name, "");
+ }
+ else
+ {
+ //error
+ throw option_requires_argument_exception(name);
+ }
+ }
+ }
+ }
+ else if (result[1].length() != 0) // group 1: long option name
+ {
+ const std::string& name = result[1];
+
+ auto iter = m_options.find(name);
+
+ if (iter == m_options.end())
+ {
+ throw option_not_exists_exception(name);
+ }
+
+ auto opt = iter->second;
+
+ //equals provided for long option?
+ if (result[3].length() != 0) // group 3: text after '='; an empty "--name=" falls through to the else branch
+ {
+ //parse the option given
+
+ //but if it doesn't take an argument, this is an error
+ if (!opt->has_arg())
+ {
+ throw option_not_has_argument_exception(name, result[3]);
+ }
+
+ parse_option(opt, name, result[3]);
+ }
+ else
+ {
+ if (opt->has_arg())
+ {
+ //parse the next argument
+ checked_parse_arg(argc, argv, current, opt, name);
+ }
+ else
+ {
+ //parse with empty argument
+ parse_option(opt, name);
+ }
+ }
+ }
+
+ }
+
+ ++current;
+ }
+
+ for (auto& opt : m_options) // options never seen on the command line receive their default value, if any
+ {
+ auto& detail = opt.second;
+ auto& value = detail->value();
+
+ if(!detail->count() && value.has_default()){
+ detail->parse_default();
+ }
+ }
+
+ if (consume_remaining)
+ {
+ while (current < argc)
+ {
+ consume_positional(argv[current]); // return value ignored: arguments after "--" that no positional accepts are not kept in argv
+ ++current;
+ }
+ }
+
+ argc = nextKeep;
+
+}
+
+//Registers one option under its short name `s` and/or long name `l`
+//(an empty name is skipped) and records its help entry in `group`.
+//Both names share a single OptionDetails instance.
+//add_one_option throws option_exists_error if a name is already taken.
+void
+Options::add_option
+(
+  const std::string& group,
+  const std::string& s,
+  const std::string& l,
+  std::string desc,
+  std::shared_ptr<const Value> value,
+  std::string arg_help
+)
+{
+  auto stringDesc = toLocalString(std::move(desc));
+  auto option = std::make_shared<OptionDetails>(stringDesc, value);
+
+  if (!s.empty())
+  {
+    add_one_option(s, option);
+  }
+
+  if (!l.empty())
+  {
+    add_one_option(l, option);
+  }
+
+  //add the help details; operator[] creates the group on first use
+  auto& help_group = m_help[group];
+
+  HelpOptionDetails entry{s, l, stringDesc,
+    value->has_arg(),
+    value->has_default(), value->get_default_value(),
+    value->has_implicit(), value->get_implicit_value(),
+    std::move(arg_help)};
+
+  help_group.options.push_back(std::move(entry));
+}
+
+//Maps the name `option` to `details`.
+//Throws option_exists_error when the name is already registered.
+void
+Options::add_one_option
+(
+  const std::string& option,
+  std::shared_ptr<OptionDetails> details
+)
+{
+  //emplace reports through .second whether a new entry was created
+  const bool inserted = m_options.emplace(option, details).second;
+
+  if (!inserted)
+  {
+    throw option_exists_error(option);
+  }
+}
+
+//Renders the help text of group `g`: an optional heading followed by one
+//entry per option, with the descriptions aligned in a common column.
+//Returns an empty string when the group does not exist.
+String
+Options::help_one_group(const std::string& g) const
+{
+  typedef std::vector<std::pair<String, String>> OptionHelp;
+
+  const auto found = m_help.find(g);
+  if (found == m_help.end())
+  {
+    return "";
+  }
+  const auto& entries = found->second.options;
+
+  String result;
+
+  //the unnamed group gets no heading
+  if (!g.empty())
+  {
+    result += toLocalString(" " + g + " options:\n\n");
+  }
+
+  //first pass: format the option column and measure its widest entry
+  OptionHelp format;
+  size_t longest = 0;
+
+  for (const auto& o : entries)
+  {
+    auto s = format_option(o);
+    longest = std::max(longest, stringLength(s));
+    format.push_back(std::make_pair(s, String()));
+  }
+
+  //the column never grows beyond OPTION_LONGEST
+  longest = std::min(longest, static_cast<size_t>(OPTION_LONGEST));
+
+  //column at which every description starts
+  const auto indent = longest + OPTION_DESC_GAP;
+
+  //widest allowed description
+  const auto allowed = size_t{76} - longest - OPTION_DESC_GAP;
+
+  //second pass: emit each option followed by its description
+  auto fiter = format.begin();
+  for (const auto& o : entries)
+  {
+    const auto d = format_description(o, indent, allowed);
+    const auto width = stringLength(fiter->first);
+
+    result += fiter->first;
+    if (width > longest)
+    {
+      //option text overflows the column: description goes on the next line
+      result += "\n";
+      result += toLocalString(std::string(indent, ' '));
+    }
+    else
+    {
+      //pad up to the description column
+      result += toLocalString(std::string(indent - width, ' '));
+    }
+    result += d;
+    result += "\n";
+
+    ++fiter;
+  }
+
+  return result;
+}
+
+//Builds the complete help message: a usage line followed by the help text
+//of each group named in `groups`, in the given order, with one extra
+//newline between consecutive groups. The result is returned as UTF-8.
+std::string
+Options::help(const std::vector<std::string>& groups) const
+{
+  String result = "Usage:\n " + toLocalString(m_program) + " [OPTION...]"
+    + m_help_string + "\n\n";
+
+  const std::size_t count = groups.size();
+  for (std::size_t i = 0; i != count; ++i)
+  {
+    //separator goes before every group except the first
+    if (i != 0)
+    {
+      result += "\n";
+    }
+    result += help_one_group(groups[i]);
+  }
+
+  return toUTF8String(result);
+}
+
+//Returns the names of all help groups, in the iteration order of m_help.
+const std::vector<std::string>
+Options::groups() const
+{
+  std::vector<std::string> names;
+
+  for (const auto& entry : m_help)
+  {
+    names.push_back(entry.first);
+  }
+
+  return names;
+}
+
+//Returns the help details stored for `group`.
+//The lookup uses m_help.at(), so asking for a group that was never
+//registered throws instead of creating an empty entry.
+const HelpGroupDetails&
+Options::group_help(const std::string& group) const
+{
+  const auto& details = m_help.at(group);
+  return details;
+}
+
+}
+
+#if defined(__GNU__)
+#pragma GCC diagnostic pop
+#endif
+
+#endif //CXX_OPTS_HPP
diff --git a/ext/include/llvm/Support/MathExtras.h b/ext/include/llvm/Support/MathExtras.h
index 8c0b110..e6f8ffa 100644
--- a/ext/include/llvm/Support/MathExtras.h
+++ b/ext/include/llvm/Support/MathExtras.h
@@ -19,7 +19,6 @@
#include <cassert>
#include <cstring>
#include <type_traits>
-#include <cstdint>
namespace llvm {
/// \brief The behavior an operation has on an input of 0.
diff --git a/ext/src/bamtools/api/internal/io/pbgzf/bgzf.c b/ext/src/bamtools/api/internal/io/pbgzf/bgzf.c
index 46d5c46..ef41efb 100644
--- a/ext/src/bamtools/api/internal/io/pbgzf/bgzf.c
+++ b/ext/src/bamtools/api/internal/io/pbgzf/bgzf.c
@@ -49,7 +49,7 @@ typedef struct {
KHASH_MAP_INIT_INT64(cache, cache_t)
#endif
-inline void
+static void
packInt16(uint8_t* buffer, uint16_t value)
{
buffer[0] = value;
@@ -62,7 +62,7 @@ unpackInt16(const uint8_t *buffer)
return buffer[0] | buffer[1] << 8;
}
-inline void
+static void
packInt32(uint8_t *buffer, uint32_t value)
{
buffer[0] = value;
diff --git a/ext/src/bamtools/api/internal/io/pbgzf/bgzf.h b/ext/src/bamtools/api/internal/io/pbgzf/bgzf.h
index 23cb162..36d443b 100644
--- a/ext/src/bamtools/api/internal/io/pbgzf/bgzf.h
+++ b/ext/src/bamtools/api/internal/io/pbgzf/bgzf.h
@@ -234,11 +234,11 @@ extern "C" {
*/
int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks);
- inline void
+ static void
packInt16(uint8_t* buffer, uint16_t value);
inline int
unpackInt16(const uint8_t* buffer);
- inline void
+ static void
packInt32(uint8_t* buffer, uint32_t value);
int
bgzf_check_header(const uint8_t* header);
diff --git a/ext/src/llvm/CMakeLists.txt b/ext/src/llvm/CMakeLists.txt
index f34f99a..6993c91 100644
--- a/ext/src/llvm/CMakeLists.txt
+++ b/ext/src/llvm/CMakeLists.txt
@@ -19,9 +19,6 @@ check_symbol_exists(backtrace "execinfo.h" HAVE_BACKTRACE)
find_library(DL_LIB NAMES "dl")
-add_definitions(-D__STDC_CONSTANT_MACROS)
-add_definitions(-D__STDC_LIMIT_MACROS)
-
# FIXME: Signal handler return type, currently hardcoded to 'void'
set(RETSIGTYPE void)
diff --git a/manual.html b/manual.html
index e6cbad1..b389735 100644
--- a/manual.html
+++ b/manual.html
@@ -1,6 +1,6 @@
<html>
<head>
- <title>SPAdes 3.8.2 Manual</title>
+ <title>SPAdes 3.9.0 Manual</title>
<style type="text/css">
.code {
background-color: lightgray;
@@ -8,7 +8,7 @@
</style>
</head>
<body>
-<h1>SPAdes 3.8.2 Manual</h1>
+<h1>SPAdes 3.9.0 Manual</h1>
1. <a href="#sec1">About SPAdes</a><br>
1.1. <a href="#sec1.1">Supported data types</a><br>
@@ -35,21 +35,23 @@
<h2>1. About SPAdes</h2>
<p>
SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. This manual will help you to install and run SPAdes.
-SPAdes version 3.8.2 was released under GPLv2 on July 10, 2016 and can be downloaded from <a href="http://bioinf.spbau.ru/en/spades" target="_blank">http://bioinf.spbau.ru/en/spades</a>.
+SPAdes version 3.9.0 was released under GPLv2 on July 23, 2016 and can be downloaded from <a href="http://bioinf.spbau.ru/en/spades" target="_blank">http://bioinf.spbau.ru/en/spades</a>.
<a name="sec1.1"></a>
<h3>1.1 Supported data types</h3>
<p>
The current version of SPAdes works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. You can also provide additional contigs that will be used as long reads.
<p>
- Version 3.8.2 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
+ Version 3.9.0 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
<p>
- SPAdes 3.8.2 includes metaSPAdes – a pipeline designed specially for metagenomic data sets. To learn more see <a href="#meta">options</a>.
-<p>
- Also, SPAdes 3.8.2 includes plasmidSPAdes – a pipeline designed for extracting and assembling plasmids from WGS data sets. To learn more see <a href="#plasmid">options</a>.
-<p>
- Additionally, SPAdes has a separate modules for assembling highly polymorphic diploid genomes and for TruSeq barcode assembly. For more information see <a href="dipspades_manual.html" target="_blank">dipSPAdes manual</a> and <a href="truspades_manual.html" target="_blank">truSPAdes manual</a> .
-
+ SPAdes 3.9.0 includes the following additional pipelines:
+ <ul>
+ <li>dipSPAdes – a module for assembling highly polymorphic diploid genomes (see <a href="dipspades_manual.html" target="_blank">dipSPAdes manual</a>).</li>
+ <li>metaSPAdes – a pipeline for metagenomic data sets (see <a href="#meta">metaSPAdes options</a>). </li>
+ <li>plasmidSPAdes – a pipeline for extracting and assembling plasmids from WGS data sets (see <a href="#plasmid">plasmidSPAdes options</a>).</li>
+ <li>rnaSPAdes – a <i>de novo</i> transcriptome assembler from RNA-Seq data (see <a href="rnaspades_manual.html" target="_blank">rnaSPAdes manual</a>).</li>
+ <li>truSPAdes – a module for TruSeq barcode assembly (see <a href="truspades_manual.html" target="_blank">truSPAdes manual</a>). </li>
+ </ul>
<a name="sec1.2"></a>
<h3>1.2 SPAdes pipeline</h3>
@@ -60,8 +62,6 @@ SPAdes comes in several separate modules:
<li> IonHammer – read error correction tool for IonTorrent data, which also works on both types of data. </li>
<li> SPAdes – iterative short-read genome assembly module; values of K are selected automatically based on the read length and data set type. </li>
<li> MismatchCorrector – a tool which improves mismatch and short indel rates in resulting contigs and scaffolds; this module uses the <a href="http://bio-bwa.sourceforge.net" target="_blank">BWA</a> tool [<a href="http://www.ncbi.nlm.nih.gov/pubmed/19451168" target="_blank">Li H. and Durbin R., 2009</a>]; MismatchCorrector is turned off by default, but we recommend to turn it on (see <a href="#correctoropt">SPAdes options section</a>). </li>
- <li> dipSPAdes – module for assembling highly polymorphic diploid genomes; for more information see <a href="dipspades_manual.html" target="_blank">dipSPAdes manual</a>. </li>
- <li> truSPAdes – an assembler for short reads produced by Illumina TruSeq Long Read technology; for more information see <a href="truspades_manual.html" target="_blank">truSPAdes manual</a>. </li>
</ul>
<p>
We recommend to run SPAdes with BayesHammer/IonHammer to obtain high-quality assemblies. However, if you use your own read correction tool, it is possible to turn error correction module off. It is also possible to use only the read error correction stage, if you wish to use another assembler. See the <a href="#pipelineopt">SPAdes options section</a>.
@@ -143,7 +143,7 @@ SPAdes comes in several separate modules:
<li> Running SPAdes without preliminary read error correction (e.g. without BayesHammer or IonHammer) will likely require more time and memory. </li>
<li> Each module removes its temporary files as soon as it finishes. </li>
<li> SPAdes uses 512 Mb per thread for buffers, which results in higher memory consumption. If you set memory limit manually, SPAdes will use smaller buffers and thus less RAM. </li>
- <li> Performance statistics is given for SPAdes version 3.8.2. </li>
+ <li> Performance statistics is given for SPAdes version 3.9.0. </li>
</ul>
@@ -157,13 +157,13 @@ SPAdes comes in several separate modules:
<h3>2.1 Downloading SPAdes Linux binaries</h3>
<p>
- To download <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
+ To download <a href="http://spades.bioinf.spbau.ru/release3.9.0/SPAdes-3.9.0-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
<pre class="code">
<code>
- wget http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Linux.tar.gz
- tar -xzf SPAdes-3.8.2-Linux.tar.gz
- cd SPAdes-3.8.2-Linux/bin/
+ wget http://spades.bioinf.spbau.ru/release3.9.0/SPAdes-3.9.0-Linux.tar.gz
+ tar -xzf SPAdes-3.9.0-Linux.tar.gz
+ cd SPAdes-3.9.0-Linux/bin/
</code>
</pre>
@@ -171,14 +171,17 @@ SPAdes comes in several separate modules:
In this case you do not need to run any installation scripts – SPAdes is ready to use. The following files will be placed in the <code>bin</code> directory:
<ul>
<li><code>spades.py</code> (main executable script)</li>
+ <li><code>dipspades.py</code> (main executable script for <a href="dipspades_manual.html" target="_blank">dipSPAdes</a>)</li>
+ <li><code>metaspades.py</code> (main executable script for <a href="#meta">metaSPAdes</a>)</li>
+ <li><code>plasmidspades.py</code> (main executable script for <a href="#plasmid">plasmidSPAdes</a>)</li>
+ <li><code>rnaspades.py</code> (main executable script for <a href="rnaspades_manual.html" target="_blank">rnaSPAdes</a>)</li>
+ <li><code>truspades.py</code> (main executable script for <a href="truspades_manual.html" target="_blank">truSPAdes</a>)</li>
<li><code>hammer</code> (read error correcting module for Illumina reads)</li>
<li><code>ionhammer</code> (read error correcting module for IonTorrent reads)</li>
<li><code>spades</code> (assembly module)</li>
<li><code>bwa-spades</code> (<a href="http://bio-bwa.sourceforge.net" target="_blank">BWA</a> alignment module which is required for mismatch correction)</li>
<li><code>corrector</code> (mismatch correction module)</li>
- <li><code>dipspades.py</code> (main executable script for <a href="dipspades_manual.html" target="_blank">dipSPAdes</a>)</li>
<li><code>dipspades</code> (assembly module for highly polymorphic diploid genomes)</li>
- <li><code>truspades.py</code> (main executable script for <a href="truspades_manual.html" target="_blank">truSPAdes</a>)</li>
<li><code>scaffold_correction</code> (executable used in truSPAdes pipeline)</li>
</ul>
@@ -189,13 +192,13 @@ SPAdes comes in several separate modules:
<h3>2.2 Downloading SPAdes binaries for Mac</h3>
<p>
- To obtain <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
+ To obtain <a href="http://spades.bioinf.spbau.ru/release3.9.0/SPAdes-3.9.0-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
<pre class="code">
<code>
- curl http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Darwin.tar.gz -o SPAdes-3.8.2-Darwin.tar.gz
- tar -zxf SPAdes-3.8.2-Darwin.tar.gz
- cd SPAdes-3.8.2-Darwin/bin/
+ curl http://spades.bioinf.spbau.ru/release3.9.0/SPAdes-3.9.0-Darwin.tar.gz -o SPAdes-3.9.0-Darwin.tar.gz
+ tar -zxf SPAdes-3.9.0-Darwin.tar.gz
+ cd SPAdes-3.9.0-Darwin/bin/
</code>
</pre>
@@ -203,14 +206,17 @@ SPAdes comes in several separate modules:
Just as in Linux, SPAdes is ready to use and no further installation steps are required. You will get the same files in the <code>bin</code> directory:
<ul>
<li><code>spades.py</code> (main executable script)</li>
+ <li><code>dipspades.py</code> (main executable script for <a href="dipspades_manual.html" target="_blank">dipSPAdes</a>)</li>
+ <li><code>metaspades.py</code> (main executable script for <a href="#meta">metaSPAdes</a>)</li>
+ <li><code>plasmidspades.py</code> (main executable script for <a href="#plasmid">plasmidSPAdes</a>)</li>
+ <li><code>rnaspades.py</code> (main executable script for <a href="rnaspades_manual.html" target="_blank">rnaSPAdes</a>)</li>
+ <li><code>truspades.py</code> (main executable script for <a href="truspades_manual.html" target="_blank">truSPAdes</a>)</li>
<li><code>hammer</code> (read error correcting module for Illumina reads)</li>
<li><code>ionhammer</code> (read error correcting module for IonTorrent reads)</li>
<li><code>spades</code> (assembly module)</li>
<li><code>bwa-spades</code> (<a href="http://bio-bwa.sourceforge.net" target="_blank">BWA</a> alignment module which is required for mismatch correction)</li>
<li><code>corrector</code> (mismatch correction module)</li>
- <li><code>dipspades.py</code> (main executable script for <a href="dipspades_manual.html" target="_blank">dipSPAdes</a>)</li>
<li><code>dipspades</code> (assembly module for highly polymorphic diploid genomes)</li>
- <li><code>truspades.py</code> (main executable script for <a href="truspades_manual.html" target="_blank">truSPAdes</a>)</li>
<li><code>scaffold_correction</code> (executable used in truSPAdes pipeline)</li>
</ul>
@@ -230,13 +236,13 @@ SPAdes comes in several separate modules:
</ul>
<p>
- If you meet these requirements, you can download the <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2.tar.gz">SPAdes source code</a>:
+ If you meet these requirements, you can download the <a href="http://spades.bioinf.spbau.ru/release3.9.0/SPAdes-3.9.0.tar.gz">SPAdes source code</a>:
<pre class="code">
<code>
- wget http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2.tar.gz
- tar -xzf SPAdes-3.8.2.tar.gz
- cd SPAdes-3.8.2
+ wget http://spades.bioinf.spbau.ru/release3.9.0/SPAdes-3.9.0.tar.gz
+ tar -xzf SPAdes-3.9.0.tar.gz
+ cd SPAdes-3.9.0
</code>
</pre>
@@ -274,14 +280,17 @@ SPAdes comes in several separate modules:
After installation you will get the same files in <code>./bin</code> (or <code><destination_dir>/bin</code> if you specified PREFIX) directory:
<ul>
<li><code>spades.py</code> (main executable script)</li>
+ <li><code>dipspades.py</code> (main executable script for <a href="dipspades_manual.html" target="_blank">dipSPAdes</a>)</li>
+ <li><code>metaspades.py</code> (main executable script for <a href="#meta">metaSPAdes</a>)</li>
+ <li><code>plasmidspades.py</code> (main executable script for <a href="#plasmid">plasmidSPAdes</a>)</li>
+ <li><code>rnaspades.py</code> (main executable script for <a href="rnaspades_manual.html" target="_blank">rnaSPAdes</a>)</li>
+ <li><code>truspades.py</code> (main executable script for <a href="truspades_manual.html" target="_blank">truSPAdes</a>)</li>
<li><code>hammer</code> (read error correcting module for Illumina reads)</li>
<li><code>ionhammer</code> (read error correcting module for IonTorrent reads)</li>
<li><code>spades</code> (assembly module)</li>
<li><code>bwa-spades</code> (<a href="http://bio-bwa.sourceforge.net" target="_blank">BWA</a> alignment module which is required for mismatch correction)</li>
<li><code>corrector</code> (mismatch correction module)</li>
- <li><code>dipspades.py</code> (main executable script for <a href="dipspades_manual.html" target="_blank">dipSPAdes</a>)</li>
<li><code>dipspades</code> (assembly module for highly polymorphic diploid genomes)</li>
- <li><code>truspades.py</code> (main executable script for <a href="truspades_manual.html" target="_blank">truSPAdes</a>)</li>
<li><code>scaffold_correction</code> (executable used in truSPAdes pipeline)</li>
</ul>
@@ -340,7 +349,7 @@ Thank you for using SPAdes!
SPAdes takes as input paired-end reads, mate-pairs and single (unpaired) reads in FASTA and FASTQ. For IonTorrent data SPAdes also supports unpaired reads in unmapped BAM format (like the one produced by Torrent Server). However, in order to run read error correction, reads should be in FASTQ or BAM format. Sanger, Oxford Nanopore and PacBio CLR reads can be provided in both formats since SPAdes does not run error correction for these types of data.
<p>
- To run SPAdes 3.8.2 you need at least one library of the following types:
+ To run SPAdes 3.9.0 you need at least one library of the following types:
<ul>
<li>Illumina paired-end/high-quality mate-pairs/unpaired reads</li>
<li>IonTorrent paired-end/high-quality mate-pairs/unpaired reads</li>
@@ -468,6 +477,13 @@ Note that we assume that SPAdes installation directory is added to the <code>PAT
See <a href="#sec3.6">section 3.6</a> for plasmidSPAdes output details.
</p>
+
+<a name="rna"></a>
+<p>
+ <code>--rna </code> (same as <code>rnaspades.py</code>)<br>
+ This flag should be used when assembling RNA-Seq data sets (runs rnaSPAdes). To learn more, see <a href="rnaspades_manual.html" target="_blank">rnaSPAdes manual</a>.
+</p>
+
<p>
<code>--iontorrent </code><br>
This flag is required when assembling IonTorrent data. Allows BAM files as input. Carefully read <a href="#sec3.3">section 3.3</a> before using this option.
diff --git a/metaspades.py b/metaspades.py
index 9f5034b..c19e2fb 100755
--- a/metaspades.py
+++ b/metaspades.py
@@ -354,6 +354,11 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.meta:
if options_storage.careful or options_storage.mismatch_corrector or options_storage.cov_cutoff != "off":
support.error("you cannot specify --careful, --mismatch-correction or --cov-cutoff in metagenomic mode!", log)
+ if options_storage.rna:
+ if options_storage.careful:
+ support.error("you cannot specify --careful in RNA-Seq mode!", log)
+ if options_storage.k_mers and options_storage.k_mers != 'auto' and len(options_storage.k_mers) > 1:
+ support.error("you cannot specify multiple k-mer sizes in RNA-Seq mode!", log)
if options_storage.continue_mode:
return None, None
@@ -385,6 +390,12 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!')
+ if options_storage.rna:
+ if len(dataset_data) != len(support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_RNA_SEQ)):
+ support.error('you cannot specify any data types except ' +
+ ', '.join(spades_logic.READS_TYPES_USED_IN_RNA_SEQ) + ' in RNA-Seq mode!')
+ if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1:
+ support.error('you cannot specify more than one paired-end library in RNA-Seq mode!')
options_storage.set_default_values()
### FILLING cfg
@@ -426,6 +437,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
options_storage.k_mers = None
if options_storage.k_mers:
cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
+ elif options_storage.rna:
+ cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_RNA
else:
cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
@@ -688,6 +701,8 @@ def main(args):
result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
+ result_transcripts_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_name)
+ result_transcripts_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_paths)
truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
misc_dir = os.path.join(cfg["common"].output_dir, "misc")
@@ -702,6 +717,8 @@ def main(args):
spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
+ spades_cfg.__dict__["result_transcripts"] = result_transcripts_filename
+ spades_cfg.__dict__["result_transcripts_paths"] = result_transcripts_paths_filename
if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
or ("mismatch_corrector" in cfg and
@@ -861,20 +878,29 @@ def main(args):
if "assembly" in cfg and os.path.isfile(result_contigs_filename):
message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
log.info(message)
- if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
- message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
- message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
- message = " * Paths in the assembly graph corresponding to the contigs are in " + \
- support.process_spaces(result_contigs_paths_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
- message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
- support.process_spaces(result_scaffolds_paths_filename)
- log.info(message)
+ if options_storage.rna:
+ if "assembly" in cfg and os.path.isfile(result_transcripts_filename):
+ message = " * Assembled transcripts are in " + support.process_spaces(result_transcripts_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_transcripts_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the transcripts are in " + \
+ support.process_spaces(result_transcripts_paths_filename)
+ log.info(message)
+ else:
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
+ message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
+ message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the contigs are in " + \
+ support.process_spaces(result_contigs_paths_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
+ support.process_spaces(result_scaffolds_paths_filename)
+ log.info(message)
#log.info("")
#breaking scaffolds
@@ -898,6 +924,9 @@ def main(args):
if options_storage.truseq_mode:
if not os.path.isfile(truseq_long_reads_file):
support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
+ elif options_storage.rna:
+ if not os.path.isfile(result_transcripts_filename):
+ support.error("TEST FAILED: %s does not exist!" % result_transcripts_filename)
else:
for result_filename in [result_contigs_filename, result_scaffolds_filename]:
if os.path.isfile(result_filename):
diff --git a/plasmidspades.py b/plasmidspades.py
index 9f5034b..c19e2fb 100755
--- a/plasmidspades.py
+++ b/plasmidspades.py
@@ -354,6 +354,11 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.meta:
if options_storage.careful or options_storage.mismatch_corrector or options_storage.cov_cutoff != "off":
support.error("you cannot specify --careful, --mismatch-correction or --cov-cutoff in metagenomic mode!", log)
+ if options_storage.rna:
+ if options_storage.careful:
+ support.error("you cannot specify --careful in RNA-Seq mode!", log)
+ if options_storage.k_mers and options_storage.k_mers != 'auto' and len(options_storage.k_mers) > 1:
+ support.error("you cannot specify multiple k-mer sizes in RNA-Seq mode!", log)
if options_storage.continue_mode:
return None, None
@@ -385,6 +390,12 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!')
+ if options_storage.rna:
+ if len(dataset_data) != len(support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_RNA_SEQ)):
+ support.error('you cannot specify any data types except ' +
+ ', '.join(spades_logic.READS_TYPES_USED_IN_RNA_SEQ) + ' in RNA-Seq mode!')
+ if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1:
+ support.error('you cannot specify more than one paired-end library in RNA-Seq mode!')
options_storage.set_default_values()
### FILLING cfg
@@ -426,6 +437,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
options_storage.k_mers = None
if options_storage.k_mers:
cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
+ elif options_storage.rna:
+ cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_RNA
else:
cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
@@ -688,6 +701,8 @@ def main(args):
result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
+ result_transcripts_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_name)
+ result_transcripts_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_paths)
truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
misc_dir = os.path.join(cfg["common"].output_dir, "misc")
@@ -702,6 +717,8 @@ def main(args):
spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
+ spades_cfg.__dict__["result_transcripts"] = result_transcripts_filename
+ spades_cfg.__dict__["result_transcripts_paths"] = result_transcripts_paths_filename
if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
or ("mismatch_corrector" in cfg and
@@ -861,20 +878,29 @@ def main(args):
if "assembly" in cfg and os.path.isfile(result_contigs_filename):
message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
log.info(message)
- if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
- message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
- message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
- message = " * Paths in the assembly graph corresponding to the contigs are in " + \
- support.process_spaces(result_contigs_paths_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
- message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
- support.process_spaces(result_scaffolds_paths_filename)
- log.info(message)
+ if options_storage.rna:
+ if "assembly" in cfg and os.path.isfile(result_transcripts_filename):
+ message = " * Assembled transcripts are in " + support.process_spaces(result_transcripts_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_transcripts_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the transcripts are in " + \
+ support.process_spaces(result_transcripts_paths_filename)
+ log.info(message)
+ else:
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
+ message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
+ message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the contigs are in " + \
+ support.process_spaces(result_contigs_paths_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
+ support.process_spaces(result_scaffolds_paths_filename)
+ log.info(message)
#log.info("")
#breaking scaffolds
@@ -898,6 +924,9 @@ def main(args):
if options_storage.truseq_mode:
if not os.path.isfile(truseq_long_reads_file):
support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
+ elif options_storage.rna:
+ if not os.path.isfile(result_transcripts_filename):
+ support.error("TEST FAILED: %s does not exist!" % result_transcripts_filename)
else:
for result_filename in [result_contigs_filename, result_scaffolds_filename]:
if os.path.isfile(result_filename):
diff --git a/metaspades.py b/rnaspades.py
similarity index 93%
copy from metaspades.py
copy to rnaspades.py
index 9f5034b..c19e2fb 100755
--- a/metaspades.py
+++ b/rnaspades.py
@@ -354,6 +354,11 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.meta:
if options_storage.careful or options_storage.mismatch_corrector or options_storage.cov_cutoff != "off":
support.error("you cannot specify --careful, --mismatch-correction or --cov-cutoff in metagenomic mode!", log)
+ if options_storage.rna:
+ if options_storage.careful:
+ support.error("you cannot specify --careful in RNA-Seq mode!", log)
+ if options_storage.k_mers and options_storage.k_mers != 'auto' and len(options_storage.k_mers) > 1:
+ support.error("you cannot specify multiple k-mer sizes in RNA-Seq mode!", log)
if options_storage.continue_mode:
return None, None
@@ -385,6 +390,12 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!')
+ if options_storage.rna:
+ if len(dataset_data) != len(support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_RNA_SEQ)):
+ support.error('you cannot specify any data types except ' +
+ ', '.join(spades_logic.READS_TYPES_USED_IN_RNA_SEQ) + ' in RNA-Seq mode!')
+ if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1:
+ support.error('you cannot specify more than one paired-end library in RNA-Seq mode!')
options_storage.set_default_values()
### FILLING cfg
@@ -426,6 +437,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
options_storage.k_mers = None
if options_storage.k_mers:
cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
+ elif options_storage.rna:
+ cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_RNA
else:
cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
@@ -688,6 +701,8 @@ def main(args):
result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
+ result_transcripts_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_name)
+ result_transcripts_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_paths)
truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
misc_dir = os.path.join(cfg["common"].output_dir, "misc")
@@ -702,6 +717,8 @@ def main(args):
spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
+ spades_cfg.__dict__["result_transcripts"] = result_transcripts_filename
+ spades_cfg.__dict__["result_transcripts_paths"] = result_transcripts_paths_filename
if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
or ("mismatch_corrector" in cfg and
@@ -861,20 +878,29 @@ def main(args):
if "assembly" in cfg and os.path.isfile(result_contigs_filename):
message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
log.info(message)
- if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
- message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
- message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
- message = " * Paths in the assembly graph corresponding to the contigs are in " + \
- support.process_spaces(result_contigs_paths_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
- message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
- support.process_spaces(result_scaffolds_paths_filename)
- log.info(message)
+ if options_storage.rna:
+ if "assembly" in cfg and os.path.isfile(result_transcripts_filename):
+ message = " * Assembled transcripts are in " + support.process_spaces(result_transcripts_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_transcripts_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the transcripts are in " + \
+ support.process_spaces(result_transcripts_paths_filename)
+ log.info(message)
+ else:
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
+ message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
+ message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the contigs are in " + \
+ support.process_spaces(result_contigs_paths_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
+ support.process_spaces(result_scaffolds_paths_filename)
+ log.info(message)
#log.info("")
#breaking scaffolds
@@ -898,6 +924,9 @@ def main(args):
if options_storage.truseq_mode:
if not os.path.isfile(truseq_long_reads_file):
support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
+ elif options_storage.rna:
+ if not os.path.isfile(result_transcripts_filename):
+ support.error("TEST FAILED: %s does not exist!" % result_transcripts_filename)
else:
for result_filename in [result_contigs_filename, result_scaffolds_filename]:
if os.path.isfile(result_filename):
diff --git a/rnaspades_manual.html b/rnaspades_manual.html
new file mode 100644
index 0000000..a072965
--- /dev/null
+++ b/rnaspades_manual.html
@@ -0,0 +1,84 @@
+<html>
+<head>
+ <title>rnaSPAdes 1.0.0 Manual</title>
+ <style type="text/css">
+ .code {
+ background-color: lightgray;
+ }
+ </style>
+</head>
+<body>
+<h1>rnaSPAdes 1.0.0 Manual</h1>
+
+1. <a href="#sec1">About rnaSPAdes</a><br>
+2. <a href="#sec2">rnaSPAdes specifics</a><br>
+3. <a href="#sec3">Assembly evaluation</a><br>
+4. <a href="#sec4">Citation</a><br>
+5. <a href="#sec5">Feedback and bug reports</a><br>
+
+<a name="sec1"></a>
+<h2>1 About rnaSPAdes</h2>
+
+<p> rnaSPAdes is a tool for <i>de novo</i> transcriptome assembly from RNA-Seq data and is suitable for all kinds of organisms. rnaSPAdes has been a part of the <a href="http://bioinf.spbau.ru/en/spades">SPAdes package</a> since version 3.9. Information about SPAdes download, requirements, installation and basic options can be found in the <a href="manual.html">SPAdes manual</a>. Below you may find information about the differences between SPAdes and rnaSPAdes.
+
+<a name="sec2"></a>
+<h2>2 rnaSPAdes specifics</h2>
+
+<p>
+To run rnaSPAdes use
+
+<pre class="code">
+<code>
+ rnaspades.py [options] -o &lt;output_dir&gt;
+</code>
+</pre>
+
+or
+
+<pre class="code">
+<code>
+ spades.py --rna [options] -o &lt;output_dir&gt;
+</code>
+</pre>
+
+Note that we assume that the SPAdes installation directory is added to the <code>PATH</code> variable (otherwise, provide the full path to the rnaSPAdes executable: <code>&lt;rnaspades installation dir&gt;/rnaspades.py</code>).
+
+
+<p>Here are the main differences of rnaSPAdes:
+ <ul>
+ <li>rnaSPAdes outputs only one FASTA file named <code>transcripts.fasta</code>. The corresponding file with paths in the <code>assembly_graph.fastg</code> is <code>transcripts.paths</code>.</li>
+ <li>rnaSPAdes can take as an input only one paired-end library and multiple single-end libraries.</li>
+ <li>rnaSPAdes does not support <code>--careful</code> and <code>--cov-cutoff</code> options.</li>
+ <li>rnaSPAdes is not compatible with other pipeline options such as <code>--meta</code>, <code>--sc</code> and <code>--plasmid</code>.</li>
+ <li>rnaSPAdes works using only a single k-mer size (55 by default). We strongly recommend not changing this parameter. In case your RNA-Seq data set contains long Illumina reads (150 bp and longer) you may try to use a longer k-mer size (approximately half of the read length). In case you have any doubts about your run, do not hesitate to contact us using the e-mail address given below.</li>
+ </ul>
+
+
+<a name="sec3"></a>
+<h2>3 Assembly evaluation</h2>
+
+<p>
+ <a href="http://bioinf.spbau.ru/en/rnaquast" target="_blank">rnaQUAST</a> may be used for transcriptome assembly quality assessment for model organisms when reference genome and gene database are available. rnaQUAST also includes <a href="http://busco.ezlab.org/" target="_blank">BUSCO</a> and <a href="http://topaz.gatech.edu/GeneMark/" target="_blank"> GeneMarkS-T</a> tools for <i>de novo</i> evaluation.
+<br>
+
+<a name="sec4"></a>
+<h2>4 Citation</h2>
+<p>
+If you use rnaSPAdes in your research, please include the main SPAdes paper <a href="http://online.liebertpub.com/doi/abs/10.1089/cmb.2012.0021" target="_blank">Bankevich, Nurk et al., 2012</a> in your reference list. A paper on rnaSPAdes is to be submitted.
+
+
+<a name="sec5"></a>
+<h2>5 Feedback and bug reports</h2>
+<p>
+ Your comments, bug reports, and suggestions are very welcome. They will help us to further improve rnaSPAdes.
+
+<p>
+ If you have any trouble running rnaSPAdes, please send us <code>params.txt</code> and <code>spades.log</code> from the directory <code>&lt;output_dir&gt;</code>.
+
+<p>
+ Address for communications: <a href="mailto:spades.support at bioinf.spbau.ru" target="_blank">spades.support at bioinf.spbau.ru</a>.
+
+<br/><br/><br/><br/><br/>
+
+</body>
+</html>
diff --git a/spades.py b/spades.py
index 9f5034b..c19e2fb 100755
--- a/spades.py
+++ b/spades.py
@@ -354,6 +354,11 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.meta:
if options_storage.careful or options_storage.mismatch_corrector or options_storage.cov_cutoff != "off":
support.error("you cannot specify --careful, --mismatch-correction or --cov-cutoff in metagenomic mode!", log)
+ if options_storage.rna:
+ if options_storage.careful:
+ support.error("you cannot specify --careful in RNA-Seq mode!", log)
+ if options_storage.k_mers and options_storage.k_mers != 'auto' and len(options_storage.k_mers) > 1:
+ support.error("you cannot specify multiple k-mer sizes in RNA-Seq mode!", log)
if options_storage.continue_mode:
return None, None
@@ -385,6 +390,12 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!')
+ if options_storage.rna:
+ if len(dataset_data) != len(support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_RNA_SEQ)):
+ support.error('you cannot specify any data types except ' +
+ ', '.join(spades_logic.READS_TYPES_USED_IN_RNA_SEQ) + ' in RNA-Seq mode!')
+ if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1:
+ support.error('you cannot specify more than one paired-end library in RNA-Seq mode!')
options_storage.set_default_values()
### FILLING cfg
@@ -426,6 +437,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
options_storage.k_mers = None
if options_storage.k_mers:
cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
+ elif options_storage.rna:
+ cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_RNA
else:
cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
@@ -688,6 +701,8 @@ def main(args):
result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
+ result_transcripts_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_name)
+ result_transcripts_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_paths)
truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
misc_dir = os.path.join(cfg["common"].output_dir, "misc")
@@ -702,6 +717,8 @@ def main(args):
spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
+ spades_cfg.__dict__["result_transcripts"] = result_transcripts_filename
+ spades_cfg.__dict__["result_transcripts_paths"] = result_transcripts_paths_filename
if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
or ("mismatch_corrector" in cfg and
@@ -861,20 +878,29 @@ def main(args):
if "assembly" in cfg and os.path.isfile(result_contigs_filename):
message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
log.info(message)
- if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
- message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
- message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
- message = " * Paths in the assembly graph corresponding to the contigs are in " + \
- support.process_spaces(result_contigs_paths_filename)
- log.info(message)
- if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
- message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
- support.process_spaces(result_scaffolds_paths_filename)
- log.info(message)
+ if options_storage.rna:
+ if "assembly" in cfg and os.path.isfile(result_transcripts_filename):
+ message = " * Assembled transcripts are in " + support.process_spaces(result_transcripts_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_transcripts_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the transcripts are in " + \
+ support.process_spaces(result_transcripts_paths_filename)
+ log.info(message)
+ else:
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
+ message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
+ message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the contigs are in " + \
+ support.process_spaces(result_contigs_paths_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
+ support.process_spaces(result_scaffolds_paths_filename)
+ log.info(message)
#log.info("")
#breaking scaffolds
@@ -898,6 +924,9 @@ def main(args):
if options_storage.truseq_mode:
if not os.path.isfile(truseq_long_reads_file):
support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
+ elif options_storage.rna:
+ if not os.path.isfile(result_transcripts_filename):
+ support.error("TEST FAILED: %s does not exist!" % result_transcripts_filename)
else:
for result_filename in [result_contigs_filename, result_scaffolds_filename]:
if os.path.isfile(result_filename):
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fd7ad2e..6ef1d66 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -100,9 +100,9 @@ install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../metaspades.py"
install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../plasmidspades.py"
DESTINATION bin
COMPONENT runtime)
-#install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../rnaspades.py"
-# DESTINATION bin
-# COMPONENT runtime)
+install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../rnaspades.py"
+ DESTINATION bin
+ COMPONENT runtime)
install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../truspades.py"
DESTINATION bin
COMPONENT runtime)
@@ -129,6 +129,9 @@ install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/../dipspades_manual.html"
install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/../truspades_manual.html"
DESTINATION share/spades
COMPONENT runtime)
+install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/../rnaspades_manual.html"
+ DESTINATION share/spades
+ COMPONENT runtime)
install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/../LICENSE"
DESTINATION share/spades
COMPONENT runtime)
diff --git a/src/cmake/pack.cmake b/src/cmake/pack.cmake
index 6b72b71..318c2d0 100644
--- a/src/cmake/pack.cmake
+++ b/src/cmake/pack.cmake
@@ -12,10 +12,10 @@ set(CPACK_PACKAGE_NAME "SPAdes")
set(CPACK_PACKAGE_VENDOR "Saint Petersburg Academic University")
set(CPACK_PACKAGE_DESCRIPTION_FILE "${SPADES_MAIN_SRC_DIR}/../README")
set(CPACK_RESOURCE_FILE_LICENSE "${SPADES_MAIN_SRC_DIR}/../LICENSE")
-set(CPACK_PACKAGE_VERSION "3.8.2")
+set(CPACK_PACKAGE_VERSION "3.9.0")
set(CPACK_PACKAGE_VERSION_MAJOR "3")
-set(CPACK_PACKAGE_VERSION_MINOR "8")
-set(CPACK_PACKAGE_VERSION_PATCH "2")
+set(CPACK_PACKAGE_VERSION_MINOR "9")
+set(CPACK_PACKAGE_VERSION_PATCH "0")
set(CPACK_STRIP_FILES bin/spades bin/hammer bin/ionhammer bin/dipspades bin/spades-bwa bin/corrector bin/scaffold_correction)
# Source stuff
diff --git a/src/modules/algorithms/genome_consistance_checker.cpp b/src/modules/algorithms/genome_consistance_checker.cpp
index c980d7c..f3009ad 100644
--- a/src/modules/algorithms/genome_consistance_checker.cpp
+++ b/src/modules/algorithms/genome_consistance_checker.cpp
@@ -95,7 +95,7 @@ PathScore GenomeConsistenceChecker::CountMisassembliesWithStrand(const Bidirecti
int dist_in_path = (int) path.LengthAt(prev_in_path) - (int) path.LengthAt(i) + (int) cur_range.mapped_range.start_pos - (int) prev_range.mapped_range.end_pos;
DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
DEBUG("Gap in genome / gap in path: " << dist_in_genome << " / " << dist_in_path);
- if (abs(dist_in_genome - dist_in_path) > absolute_max_gap_ && (dist_in_genome * (1 + relative_max_gap_) < dist_in_path || dist_in_path * (1 + relative_max_gap_) < dist_in_genome)) {
+ if (size_t(abs(dist_in_genome - dist_in_path)) > absolute_max_gap_ && (dist_in_genome * (1 + relative_max_gap_) < dist_in_path || dist_in_path * (1 + relative_max_gap_) < dist_in_genome)) {
res.wrong_gap_size ++;
}
diff --git a/src/modules/algorithms/graph_construction.hpp b/src/modules/algorithms/graph_construction.hpp
index ce32a7e..d7034e6 100644
--- a/src/modules/algorithms/graph_construction.hpp
+++ b/src/modules/algorithms/graph_construction.hpp
@@ -110,11 +110,12 @@ void EarlyClipTips(size_t k, const config::debruijn_config::construction& params
}
}
+#include "data_structures/indices/kmer_extension_index_builder.hpp"
+
template<class Graph, class Read, class Index>
ReadStatistics ConstructGraphUsingExtentionIndex(const config::debruijn_config::construction params,
- io::ReadStreamList<Read>& streams, Graph& g,
- Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
-
+ io::ReadStreamList<Read>& streams, Graph& g,
+ Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
size_t k = g.k();
INFO("Constructing DeBruijn graph for k=" << k);
@@ -167,7 +168,7 @@ ReadStatistics ConstructGraphWithCoverage(const config::debruijn_config::constru
io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
ReadStatistics rs = ConstructGraph(params, streams, g, index, contigs_stream);
- typedef typename Index::InnerIndexT InnerIndex;
+ typedef typename Index::InnerIndex InnerIndex;
typedef typename EdgeIndexHelper<InnerIndex>::CoverageAndGraphPositionFillingIndexBuilderT IndexBuilder;
INFO("Filling coverage index")
IndexBuilder().ParallelFillCoverage(index.inner_index(), streams);
diff --git a/src/modules/algorithms/mismatch_shall_not_pass.hpp b/src/modules/algorithms/mismatch_shall_not_pass.hpp
index ed08660..0451adb 100644
--- a/src/modules/algorithms/mismatch_shall_not_pass.hpp
+++ b/src/modules/algorithms/mismatch_shall_not_pass.hpp
@@ -11,6 +11,7 @@
#include "assembly_graph/handlers/id_track_handler.hpp"
#include "dev_support/logger/logger.hpp"
+#include "io/reads_io/read_stream_vector.hpp"
#include "data_structures/sequence/runtime_k.hpp"
#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
@@ -79,8 +80,10 @@ private:
void CollectPotensialMismatches(const graph_pack &gp) {
auto &kmer_mapper = gp.kmer_mapper;
for (auto it = kmer_mapper.begin(); it != kmer_mapper.end(); ++it) {
- runtime_k::RtSeq from = it->first;
- runtime_k::RtSeq to = it->second;
+ // Kmer mapper iterator dereferences to pair (KMer, KMer), not to the reference!
+ const auto mentry = *it;
+ const runtime_k::RtSeq &from = mentry.first;
+ const runtime_k::RtSeq &to = mentry.second;
size_t cnt = 0;
size_t cnt_arr[4];
for (size_t i = 0; i < 4; i++)
@@ -104,13 +107,6 @@ private:
}
}
}
- for (auto it = gp.g.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (gp.g.length(*it) < cfg::get().max_repeat_length) {
- // INFO("edge id " <<gp.g.int_id(*it) << " added to stat" );
- // for(size_t i = 0; i < gp.g.length(*it) + gp.g.k(); i++)
- // statistics_[*it].AddPosition(i);
- }
- }
}
void operator+=(const MismatchStatistics<EdgeId> &other) {
@@ -204,7 +200,6 @@ private:
typedef typename graph_pack::graph_t Graph;
typedef typename Graph::EdgeId EdgeId;
typedef typename Graph::VertexId VertexId;
- typedef runtime_k::RtSeq Kmer;
graph_pack &gp_;
double relative_threshold_;
diff --git a/src/modules/algorithms/path_extend/extension_chooser.hpp b/src/modules/algorithms/path_extend/extension_chooser.hpp
index b00f944..b0a989a 100644
--- a/src/modules/algorithms/path_extend/extension_chooser.hpp
+++ b/src/modules/algorithms/path_extend/extension_chooser.hpp
@@ -517,9 +517,11 @@ protected:
for (size_t j = 0; j < histogram.size(); ++j) {
sum += histogram[j].second;
}
- if (sum <= cl_weight_threshold_) {
+ DEBUG("Weight for scaffolding = " << sum << ", threshold = " << cl_weight_threshold_)
+ if (math::ls(sum, cl_weight_threshold_)) {
continue;
}
+
int gap = CountMean(histogram);
if (HasIdealInfo(path, e, gap)) {
DEBUG("scaffolding " << g_.int_id(e) << " gap " << gap);
@@ -555,9 +557,12 @@ protected:
public:
- ScaffoldingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double is_scatter_coeff) :
+
+ ScaffoldingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc,
+ double cl_weight_threshold,
+ double is_scatter_coeff) :
ExtensionChooser(g, wc), raw_weight_threshold_(0.0),
- cl_weight_threshold_(cfg::get().pe_params.param_set.scaffolder_options.cl_threshold),
+ cl_weight_threshold_(cl_weight_threshold),
is_scatter_coeff_(is_scatter_coeff) {
}
@@ -570,6 +575,7 @@ public:
FindBestFittedEdgesForClustered(path, candidates, result);
return result;
}
+
private:
DECL_LOGGER("ScaffoldingExtensionChooser");
};
@@ -584,12 +590,15 @@ private:
DECL_LOGGER("LongReadsUniqueEdgeAnalyzer")
public:
LongReadsUniqueEdgeAnalyzer(const Graph& g, const GraphCoverageMap& cov_map,
- double filter_threshold, double prior_threshold, size_t max_repeat_length)
+ double filter_threshold, double prior_threshold,
+ size_t max_repeat_length, bool uneven_depth)
: g_(g),
cov_map_(cov_map),
filter_threshold_(filter_threshold),
prior_threshold_(prior_threshold),
- max_repeat_length_(max_repeat_length) {
+ max_repeat_length_(max_repeat_length),
+ uneven_depth_(uneven_depth) {
+
FindAllUniqueEdges();
}
@@ -710,15 +719,13 @@ private:
}
void FindAllUniqueCoverageEdges() {
- if (cfg::get().uneven_depth) {
- return;
- }
+ VERIFY(!uneven_depth_);
double sum_cov = 0;
size_t sum_len = 0;
size_t total_len = 0;
for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
total_len += g_.length(*iter);
- if (g_.length(*iter) >= cfg::get().max_repeat_length) {
+ if (g_.length(*iter) >= max_repeat_length_) {
sum_cov += g_.coverage(*iter) * (double)g_.length(*iter);
sum_len += g_.length(*iter);
}
@@ -739,16 +746,17 @@ private:
void FindAllUniqueEdges() {
- DEBUG("Looking for unique edges");
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (UniqueEdge(*iter)) {
- unique_edges_.insert(*iter);
- unique_edges_.insert(g_.conjugate(*iter));
- }
- }
- DEBUG("coverage based uniqueness started");
- FindAllUniqueCoverageEdges();
- DEBUG("Unique edges are found");
+ DEBUG("Looking for unique edges");
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (UniqueEdge(*iter)) {
+ unique_edges_.insert(*iter);
+ unique_edges_.insert(g_.conjugate(*iter));
+ }
+ }
+ DEBUG("coverage based uniqueness started");
+ if (!uneven_depth_)
+ FindAllUniqueCoverageEdges();
+ DEBUG("Unique edges are found");
}
const Graph& g_;
@@ -757,6 +765,7 @@ private:
double prior_threshold_;
std::set<EdgeId> unique_edges_;
size_t max_repeat_length_;
+ bool uneven_depth_;
};
class SimpleScaffolding {
@@ -812,13 +821,16 @@ public:
double weight_priority_threshold,
double unique_edge_priority_threshold,
size_t min_significant_overlap,
- size_t max_repeat_length)
+ size_t max_repeat_length,
+ bool uneven_depth)
: ExtensionChooser(g),
filtering_threshold_(filtering_threshold),
weight_priority_threshold_(weight_priority_threshold),
min_significant_overlap_(min_significant_overlap),
cov_map_(g, pc),
- unique_edge_analyzer_(g, cov_map_, filtering_threshold, unique_edge_priority_threshold, max_repeat_length),
+ unique_edge_analyzer_(g, cov_map_, filtering_threshold,
+ unique_edge_priority_threshold,
+ max_repeat_length, uneven_depth),
simple_scaffolding_(g) {
}
@@ -925,7 +937,8 @@ private:
class MatePairExtensionChooser : public ExtensionChooser {
public:
MatePairExtensionChooser(const Graph& g, shared_ptr<PairedInfoLibrary> lib,
- const PathContainer& paths, size_t max_number_of_paths_to_search)
+ const PathContainer& paths, size_t max_number_of_paths_to_search,
+ bool uneven_depth)
: ExtensionChooser(g),
g_(g),
lib_(lib),
@@ -933,7 +946,8 @@ public:
weight_counter_(g, lib, 10),
cov_map_(g_, paths),
path_searcher_(g_, cov_map_, lib_->GetISMax(), PathsWeightCounter(g, lib, (size_t) lib->GetSingleThreshold()), max_number_of_paths_to_search),
- unique_edge_analyzer_(g, cov_map_, 0., 1000., 8000.),
+ //TODO params
+ unique_edge_analyzer_(g, cov_map_, 0., 1000., 8000., uneven_depth),
simple_scaffolder_(g) {
}
diff --git a/src/modules/algorithms/path_extend/overlap_analysis.hpp b/src/modules/algorithms/path_extend/overlap_analysis.hpp
index d773b57..b119a7d 100644
--- a/src/modules/algorithms/path_extend/overlap_analysis.hpp
+++ b/src/modules/algorithms/path_extend/overlap_analysis.hpp
@@ -86,28 +86,38 @@ public:
SWOverlapAnalyzer(size_t flank_length)
: flank_length_(flank_length),
aligner_(/*match_score*/1,
- /*mismatch_penalty*/3,
+ /*mismatch_penalty*/3,
/*gap_opening_penalty*/4,
/*gap_extending_penalty*/3) {
}
OverlapInfo AnalyzeOverlap(const Sequence& s1, const Sequence& s2) const {
+ DEBUG("Analysis started");
size_t start1 = flank_length_ > s1.size() ? 0 : s1.size() - flank_length_;
size_t end2 = flank_length_ > s2.size() ? s2.size() : flank_length_;
+ DEBUG("s1 " << s1.Subseq(start1, s1.size()));
+ DEBUG("s2 " << s2.Subseq(0, end2));
OverlapInfo result = InnerAnalyze(s1.Subseq(start1, s1.size()), s2.Subseq(0, end2));
- if (result == OverlapInfo())
+ if (result == OverlapInfo()) {
+ DEBUG("Empty overlap")
return result;
+ }
result.r1.shift(int(start1));
+ DEBUG("Result " << result)
return result;
}
template<class Graph>
OverlapInfo AnalyzeOverlap(const Graph& g, EdgeId e1, EdgeId e2) const {
+ DEBUG("Analyzing edges " << g.str(e1) << " and " << g.str(e2));
return AnalyzeOverlap(g.EdgeNucls(e1), g.EdgeNucls(e2));
}
+
+private:
+ DECL_LOGGER("SWOverlapAnalyzer");
};
}
diff --git a/src/modules/algorithms/path_extend/path_extend_launch.hpp b/src/modules/algorithms/path_extend/path_extend_launch.hpp
index 5b11bc7..ba1d4e3 100644
--- a/src/modules/algorithms/path_extend/path_extend_launch.hpp
+++ b/src/modules/algorithms/path_extend/path_extend_launch.hpp
@@ -32,41 +32,76 @@
namespace path_extend {
using namespace debruijn_graph;
-typedef omnigraph::de::PairedInfoIndicesT<Graph> PairedInfoIndicesT;
-inline size_t FindMaxOverlapedLen(const vector<shared_ptr<PairedInfoLibrary> >& libs) {
- size_t max = 0;
- for (size_t i = 0; i < libs.size(); ++i) {
- max = std::max(libs[i]->GetISMax(), max);
+struct PathExtendParamsContainer {
+
+ PathExtendParamsContainer(const pe_config::MainPEParamsT& pe_cfg_,
+ const std::string& output_dir_,
+ const std::string& contigs_name_,
+ const std::string& scf_name_,
+ config::pipeline_type mode_,
+ bool uneven_depth_,
+ bool avoid_rc_connections_,
+ bool use_scaffolder_,
+ bool output_broken_scaffolds_ = true):
+ pe_cfg(pe_cfg_),
+ pset(pe_cfg_.param_set),
+ output_dir(output_dir_),
+ etc_dir(output_dir + pe_cfg_.etc_dir + "/"),
+ contigs_name(scf_name_),
+ broken_contigs(contigs_name_),
+ mode(mode_),
+ uneven_depth(uneven_depth_),
+ avoid_rc_connections(avoid_rc_connections_),
+ use_scaffolder(use_scaffolder_),
+ traverse_loops(true),
+ output_broken_scaffolds(output_broken_scaffolds_)
+ {
+ if (!(use_scaffolder && pset.scaffolder_options.enabled)) {
+ contigs_name = contigs_name_;
+ traverse_loops = false;
+ output_broken_scaffolds = false;
+ }
}
- return max;
-}
-inline string GetEtcDir(const std::string& output_dir) {
- return output_dir + cfg::get().pe_params.etc_dir + "/";
-}
+ const pe_config::MainPEParamsT& pe_cfg;
+ const pe_config::ParamSetT& pset;
+
+ std::string output_dir;
+ std::string etc_dir;
+
+ std::string contigs_name;
+ std::string broken_contigs;
+
+ config::pipeline_type mode;
+ bool uneven_depth;
+
+ bool avoid_rc_connections;
+ bool use_scaffolder;
+ bool traverse_loops;
+ bool output_broken_scaffolds;
+};
inline void DebugOutputPaths(const conj_graph_pack& gp,
- const std::string& output_dir, const PathContainer& paths,
- const string& name) {
+ const PathExtendParamsContainer& params,
+ const PathContainer& paths,
+ const string& name) {
PathInfoWriter path_writer;
PathVisualizer visualizer;
DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
- ContigWriter writer(gp.g, constructor, gp.components);
+ ContigWriter writer(gp.g, constructor, gp.components, params.mode == config::pipeline_type::plasmid);
- string etcDir = GetEtcDir(output_dir);
- if (!cfg::get().pe_params.debug_output) {
+ if (!params.pe_cfg.debug_output) {
return;
}
- writer.OutputPaths(paths, etcDir + name);
- if (cfg::get().pe_params.output.write_paths) {
- path_writer.WritePaths(paths, etcDir + name + ".dat");
+ writer.OutputPaths(paths, params.etc_dir + name);
+ if (params.pe_cfg.output.write_paths) {
+ path_writer.WritePaths(paths, params.etc_dir + name + ".dat");
}
- if (cfg::get().pe_params.viz.print_paths) {
- visualizer.writeGraphWithPathsSimple(gp, etcDir + name + ".dot", name,
- paths);
+ if (params.pe_cfg.viz.print_paths) {
+ visualizer.writeGraphWithPathsSimple(gp, params.etc_dir + name + ".dot", name, paths);
}
}
@@ -92,20 +127,18 @@ inline void SetSingleThresholdForLib(shared_ptr<PairedInfoLibrary> lib, const pe
}
-inline string MakeNewName(const std::string& contigs_name, const std::string& subname) {
- return contigs_name.substr(0, contigs_name.rfind(".fasta")) + "_" + subname + ".fasta";
-}
-
-inline void OutputBrokenScaffolds(PathContainer& paths, int k,
- const ContigWriter& writer,
- const std::string& filename) {
- if (!cfg::get().pe_params.param_set.scaffolder_options.on
- or !cfg::get().use_scaffolder
- or cfg::get().pe_params.obs == obs_none) {
+inline void OutputBrokenScaffolds(PathContainer& paths,
+ const PathExtendParamsContainer& params,
+ int k,
+ const ContigWriter& writer,
+ const std::string& filename) {
+ if (!params.pset.scaffolder_options.enabled
+ || !params.use_scaffolder
+ || params.pe_cfg.obs == obs_none) {
return;
}
- int min_gap = cfg::get().pe_params.obs == obs_break_all ? k / 2 : k;
+ int min_gap = params.pe_cfg.obs == obs_break_all ? k / 2 : k;
ScaffoldBreaker breaker(min_gap, paths);
breaker.container().SortByLength();
@@ -130,8 +163,8 @@ inline void AddPathsToContainer(const conj_graph_pack& gp,
DEBUG("Long reads paths " << result.size() << " == ");
}
-bool HasOnlyMPLibs() {
- for (const auto& lib : cfg::get().ds.reads) {
+bool HasOnlyMPLibs(const config::dataset& dataset_info) {
+ for (const auto& lib : dataset_info.reads) {
if (!((lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs) &&
lib.data().mean_insert_size > 0.0)) {
return false;
@@ -140,8 +173,9 @@ bool HasOnlyMPLibs() {
return true;
}
-bool UseCoverageResolverForSingleReads(const io::LibraryType& type) {
- return HasOnlyMPLibs() && (type == io::LibraryType::HQMatePairs);
+bool UseCoverageResolverForSingleReads(const config::dataset& dataset_info,
+ const io::LibraryType& type) {
+ return HasOnlyMPLibs(dataset_info) && (type == io::LibraryType::HQMatePairs);
}
inline size_t CountEdgesInGraph(const Graph& g) {
@@ -186,12 +220,19 @@ inline void ClonePathContainer(PathContainer& spaths, PathContainer& tpaths, Gra
}
}
-inline void FinalizePaths(PathContainer& paths, GraphCoverageMap& cover_map, size_t min_edge_len, size_t max_path_diff, bool mate_pairs = false) {
+inline void FinalizePaths(const PathExtendParamsContainer& params,
+ PathContainer& paths,
+ const Graph& g,
+ GraphCoverageMap& cover_map,
+ size_t min_edge_len,
+ size_t max_path_diff,
+ bool mate_pairs = false) {
PathExtendResolver resolver(cover_map.graph());
-
- if (cfg::get().pe_params.param_set.remove_overlaps) {
- resolver.removeOverlaps(paths, cover_map, min_edge_len, max_path_diff, cfg::get().pe_params.param_set.cut_all_overlaps);
+ if (params.pset.remove_overlaps) {
+ resolver.removeOverlaps(paths, cover_map, min_edge_len, max_path_diff,
+ params.pset.cut_all_overlaps,
+ (params.mode == config::pipeline_type::moleculo));
}
else {
resolver.removeEqualPaths(paths, cover_map, min_edge_len);
@@ -199,13 +240,18 @@ inline void FinalizePaths(PathContainer& paths, GraphCoverageMap& cover_map, siz
if (mate_pairs) {
resolver.RemoveMatePairEnds(paths, min_edge_len);
}
- if (cfg::get().avoid_rc_connections) {
+ if (params.avoid_rc_connections) {
paths.FilterInterstandBulges();
}
paths.FilterEmptyPaths();
if (!mate_pairs) {
resolver.addUncoveredEdges(paths, cover_map);
}
+ if (params.pset.path_filtration.enabled) {
+ LengthPathFilter(g, params.pset.path_filtration.min_length).filter(paths);;
+ IsolatedPathFilter(g, params.pset.path_filtration.min_length_for_low_covered, params.pset.path_filtration.min_coverage).filter(paths);
+ IsolatedPathFilter(g, params.pset.path_filtration.isolated_min_length).filter(paths);
+ }
paths.SortByLength();
for(auto& path : paths) {
path.first->ResetOverlaps();
@@ -276,236 +322,414 @@ inline bool IsPolishingStage(PathExtendStage stage) {
template<class Index>
-inline shared_ptr<PairedInfoLibrary> MakeNewLib(const conj_graph_pack::graph_t& g,
- const Index& paired_index,
- size_t index) {
- const auto& lib = cfg::get().ds.reads[index];
+inline shared_ptr<PairedInfoLibrary> MakeNewLib(const config::dataset::Library& lib,
+ const conj_graph_pack::graph_t& g,
+ const Index& paired_index) {
size_t read_length = lib.data().read_length;
size_t is = (size_t) lib.data().mean_insert_size;
int is_min = (int) lib.data().insert_size_left_quantile;
int is_max = (int) lib.data().insert_size_right_quantile;
int var = (int) lib.data().insert_size_deviation;
bool is_mp = lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs ;
- return make_shared< PairedInfoLibraryWithIndex<decltype(paired_index[index])> >(cfg::get().K, g, read_length,
+ return make_shared< PairedInfoLibraryWithIndex<decltype(paired_index)> >(g.k(), g, read_length,
is, is_min > 0.0 ? size_t(is_min) : 0, is_max > 0.0 ? size_t(is_max) : 0,
size_t(var),
- paired_index[index], is_mp,
+ paired_index, is_mp,
lib.data().insert_size_distribution);
}
-pe_config::LongReads GetLongReadsConfig(const io::LibraryType& type) {
- auto long_reads = cfg::get().pe_params.long_reads;
+pe_config::LongReads GetLongReadsConfig(const PathExtendParamsContainer& params,
+ const io::LibraryType& type) {
if (io::SequencingLibraryBase::is_long_read_lib(type)) {
- return long_reads.pacbio_reads;
+ return params.pe_cfg.long_reads.pacbio_reads;
} else if (type == io::LibraryType::PathExtendContigs){
- return long_reads.meta_contigs;
+ return params.pe_cfg.long_reads.meta_contigs;
} else if (io::SequencingLibraryBase::is_contig_lib(type)) {
- return long_reads.contigs;
+ return params.pe_cfg.long_reads.contigs;
}
- return long_reads.single_reads;
+ return params.pe_cfg.long_reads.single_reads;
}
-inline shared_ptr<ExtensionChooser> MakeLongReadsExtensionChooser(const conj_graph_pack& gp,
- size_t lib_index,
- size_t max_repeat_length) {
+
+inline shared_ptr<ExtensionChooser> MakeLongReadsExtensionChooser(const config::dataset::Library& lib,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp) {
PathContainer paths;
AddPathsToContainer(gp, gp.single_long_reads[lib_index].GetAllPaths(), 1, paths);
- auto long_reads_config = GetLongReadsConfig(cfg::get().ds.reads[lib_index].type());
+ auto long_reads_config = GetLongReadsConfig(params, lib.type());
return make_shared<LongReadsExtensionChooser>(gp.g, paths, long_reads_config.filtering,
long_reads_config.weight_priority,
long_reads_config.unique_edge_priority,
long_reads_config.min_significant_overlap,
- max_repeat_length);
+ params.pset.extension_options.max_repeat_length,
+ params.uneven_depth);
}
-inline shared_ptr<SimpleExtender> MakeLongReadsExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+
+inline shared_ptr<SimpleExtender> MakeLongReadsExtender(const config::dataset& dataset_info,
size_t lib_index,
- const pe_config::ParamSetT& pset) {
- const auto& lib = cfg::get().ds.reads[lib_index];
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map) {
+ const auto& lib = dataset_info.reads[lib_index];
size_t resolvable_repeat_length_bound = 10000ul;
if (!lib.is_contig_lib()) {
resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length);
}
INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound);
- auto long_read_ec = MakeLongReadsExtensionChooser(gp, lib_index, pset.extension_options.max_repeat_length);
- return make_shared<SimpleExtender>(gp, cov_map, long_read_ec, resolvable_repeat_length_bound,
- pset.loop_removal.max_loops, true, UseCoverageResolverForSingleReads(lib.type()));
-}
-inline shared_ptr<SimpleExtender> MakeLongEdgePEExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
+ auto long_read_ec = MakeLongReadsExtensionChooser(lib, lib_index, params, gp);
+ return make_shared<SimpleExtender>(gp, cov_map,
+ long_read_ec,
+ resolvable_repeat_length_bound,
+ params.pset.loop_removal.max_loops,
+ true, /* investigate short loops */
+ UseCoverageResolverForSingleReads(dataset_info, lib.type()));
+}
- shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, lib, pset.normalize_weight);
- shared_ptr<ExtensionChooser> extension = make_shared<LongEdgeExtensionChooser>(gp.g, wc, GetWeightThreshold(lib, pset), GetPriorityCoeff(lib, pset));
- return make_shared<SimpleExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
+inline shared_ptr<SimpleExtender> MakeLongEdgePEExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map,
+ bool investigate_loops) {
+
+ const auto& lib = dataset_info.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
+ SetSingleThresholdForLib(paired_lib, params.pset, lib.data().pi_threshold);
+ INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
+
+ shared_ptr<WeightCounter> wc =
+ make_shared<PathCoverWeightCounter>(gp.g, paired_lib, params.pset.normalize_weight);
+ shared_ptr<ExtensionChooser> extension =
+ make_shared<LongEdgeExtensionChooser>(gp.g, wc,
+ GetWeightThreshold(paired_lib, params.pset),
+ GetPriorityCoeff(paired_lib, params.pset));
+
+ return make_shared<SimpleExtender>(gp, cov_map,
+ extension,
+ paired_lib->GetISMax(),
+ params.pset.loop_removal.max_loops,
+ investigate_loops,
+ false /*use short loop coverage resolver*/);
}
-inline shared_ptr<SimpleExtensionChooser> MakeMetaExtensionChooser(const conj_graph_pack& gp,
- shared_ptr<PairedInfoLibrary> lib,
- const pe_config::ParamSetT& pset) {
- VERIFY(cfg::get().mode == config::pipeline_type::meta);
+inline shared_ptr<SimpleExtensionChooser> MakeMetaExtensionChooser(shared_ptr<PairedInfoLibrary> lib,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ size_t read_length) {
+ VERIFY(params.mode == config::pipeline_type::meta);
VERIFY(!lib->IsMp());
- shared_ptr<WeightCounter> wc = make_shared<MetagenomicWeightCounter>(gp.g, lib, /*read_length*/cfg::get().ds.RL(),
- /*normalized_threshold*/ 0.3, /*raw_threshold*/ 3, /*estimation_edge_length*/ 0);
+ shared_ptr<WeightCounter> wc = make_shared<MetagenomicWeightCounter>(gp.g,
+ lib,
+ read_length, //read_length
+ 0.3, //normalized_threshold
+ 3, //raw_threshold
+ 0 /*estimation_edge_length*/ );
return make_shared<SimpleExtensionChooser>(gp.g, wc,
- pset.extension_options.weight_threshold,
- pset.extension_options.priority_coeff);
+ params.pset.extension_options.weight_threshold,
+ params.pset.extension_options.priority_coeff);
}
-inline shared_ptr<SimpleExtender> MakeMetaExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- return make_shared<SimpleExtender>(gp, cov_map, MakeMetaExtensionChooser(gp, lib, pset),
- lib->GetISMax(), pset.loop_removal.max_loops,
- investigate_loops, false);
+inline shared_ptr<SimpleExtender> MakeMetaExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map,
+ bool investigate_loops) {
+
+ const auto& lib = dataset_info.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
+
+ return make_shared<SimpleExtender>(gp, cov_map,
+ MakeMetaExtensionChooser(paired_lib, params, gp, dataset_info.RL()),
+ paired_lib->GetISMax(),
+ params.pset.loop_removal.max_loops,
+ investigate_loops,
+ false /*use short loop coverage resolver*/);
}
-inline shared_ptr<SimpleExtender> MakePEExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
-
- shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, lib, pset.normalize_weight);
- auto extension = make_shared<SimpleExtensionChooser>(gp.g, wc, GetWeightThreshold(lib, pset), GetPriorityCoeff(lib, pset));
- return make_shared<SimpleExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
+inline shared_ptr<SimpleExtender> MakePEExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map,
+ bool investigate_loops) {
+
+ const auto& lib = dataset_info.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
+ SetSingleThresholdForLib(paired_lib, params.pset, lib.data().pi_threshold);
+ INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
+
+ shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, paired_lib, params.pset.normalize_weight);
+ auto extension = make_shared<SimpleExtensionChooser>(gp.g, wc,
+ GetWeightThreshold(paired_lib, params.pset),
+ GetPriorityCoeff(paired_lib, params.pset));
+
+ return make_shared<SimpleExtender>(gp, cov_map,
+ extension,
+ paired_lib->GetISMax(),
+ params.pset.loop_removal.max_loops,
+ investigate_loops,
+ false /*use short loop coverage resolver*/);
}
-inline shared_ptr<PathExtender> MakeScaffoldingExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.scaffolding_indices, lib_index);
- shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, lib);
- //FIXME this variable was not used!
- //double prior_coef = GetPriorityCoeff(lib, pset);
- //FIXME review parameters
- //todo put parameters in config
- //FIXME remove max_must_overlap from config
- double var_coeff = 3.0;
- auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter, var_coeff);
+inline shared_ptr<PathExtender> MakeScaffoldingExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map) {
+ const auto& lib = dataset_info.reads[lib_index];
+ const auto& pset = params.pset;
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.scaffolding_indices[lib_index]);
- vector<shared_ptr<GapJoiner>> joiners;
+ shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, paired_lib);
- if (pset.scaffolder_options.use_la_gap_joiner) {
+ auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter,
+ pset.scaffolder_options.cl_threshold,
+ pset.scaffolder_options.var_coeff);
+
+ vector<shared_ptr<GapJoiner>> joiners;
+ if (params.pset.scaffolder_options.use_la_gap_joiner)
joiners.push_back(std::make_shared<LAGapJoiner>(gp.g, pset.scaffolder_options.min_overlap_length,
- pset.scaffolder_options.flank_multiplication_coefficient,
- pset.scaffolder_options.flank_addition_coefficient));
- }
+ pset.scaffolder_options.flank_multiplication_coefficient,
+ pset.scaffolder_options.flank_addition_coefficient));
- joiners.push_back(std::make_shared<HammingGapJoiner>(gp.g, pset.scaffolder_options.min_gap_score,
- pset.scaffolder_options.short_overlap,
- (int) 2 * cfg::get().ds.RL()));
+
+ joiners.push_back(std::make_shared<HammingGapJoiner>(gp.g,
+ pset.scaffolder_options.min_gap_score,
+ pset.scaffolder_options.short_overlap,
+ (int) pset.scaffolder_options.basic_overlap_coeff * dataset_info.RL()));
auto composite_gap_joiner = std::make_shared<CompositeGapJoiner>(gp.g,
joiners,
- size_t(pset.scaffolder_options.max_can_overlap * (double) gp.g.k()),
- int(math::round((double) gp.g.k() - var_coeff * (double) lib->GetIsVar())),
+ size_t(pset.scaffolder_options.max_can_overlap * (double) gp.g.k()), /* may overlap threshold */
+ int(math::round((double) gp.g.k() - pset.scaffolder_options.var_coeff * (double) paired_lib->GetIsVar())), /* must overlap threshold */
pset.scaffolder_options.artificial_gap);
- return make_shared<ScaffoldingPathExtender>(gp, cov_map, scaff_chooser, composite_gap_joiner, lib->GetISMax(), pset.loop_removal.max_loops, false);
+ return make_shared<ScaffoldingPathExtender>(gp, cov_map, scaff_chooser,
+ composite_gap_joiner,
+ paired_lib->GetISMax(),
+ pset.loop_removal.max_loops,
+ false, /* investigate short loops */
+ params.avoid_rc_connections);
+}
+
+
+inline shared_ptr<PathExtender> MakeRNAScaffoldingExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map) {
+
+ const auto& lib = dataset_info.reads[lib_index];
+ const auto& pset = params.pset;
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.paired_indices[lib_index]);
+
+ shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, paired_lib);
+
+ auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter, pset.scaffolder_options.cutoff, pset.scaffolder_options.var_coeff);
+ auto scaff_chooser2 = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter, pset.scaffolder_options.hard_cutoff, pset.scaffolder_options.var_coeff);
+
+ vector<shared_ptr<GapJoiner>> joiners;
+ if (params.pset.scaffolder_options.use_la_gap_joiner)
+ joiners.push_back(std::make_shared<LAGapJoiner>(gp.g, pset.scaffolder_options.min_overlap_length,
+ pset.scaffolder_options.flank_multiplication_coefficient,
+ pset.scaffolder_options.flank_addition_coefficient));
+
+
+ joiners.push_back(std::make_shared<HammingGapJoiner>(gp.g,
+ pset.scaffolder_options.min_gap_score,
+ pset.scaffolder_options.short_overlap,
+ (int) pset.scaffolder_options.basic_overlap_coeff * dataset_info.RL()));
+
+ auto composite_gap_joiner = std::make_shared<CompositeGapJoiner>(gp.g,
+ joiners,
+ size_t(pset.scaffolder_options.max_can_overlap * (double) gp.g.k()), /* may overlap threshold */
+ int(math::round((double) gp.g.k() - pset.scaffolder_options.var_coeff * (double) paired_lib->GetIsVar())), /* must overlap threshold */
+ pset.scaffolder_options.artificial_gap);
+
+ VERIFY(pset.scaffolder_options.min_overlap_for_rna_scaffolding.is_initialized());
+ return make_shared<RNAScaffoldingPathExtender>(gp, cov_map,
+ scaff_chooser,
+ scaff_chooser2,
+ composite_gap_joiner,
+ paired_lib->GetISMax(),
+ pset.loop_removal.max_loops,
+ false /* investigate short loops */,
+ *pset.scaffolder_options.min_overlap_for_rna_scaffolding);
}
-inline shared_ptr<PathExtender> MakeScaffolding2015Extender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, const ScaffoldingUniqueEdgeStorage& storage) {
- shared_ptr<PairedInfoLibrary> lib;
- INFO("for lib " << lib_index);
+inline shared_ptr<PathExtender> MakeScaffolding2015Extender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map,
+ const ScaffoldingUniqueEdgeStorage& storage) {
+
+ const auto& lib = dataset_info.reads[lib_index];
+ const auto& pset = params.pset;
+ shared_ptr<PairedInfoLibrary> paired_lib;
+ INFO("Creating Scaffolding 2015 extender for lib #" << lib_index);
//TODO:: temporary solution
if (gp.paired_indices[lib_index].size() > gp.clustered_indices[lib_index].size()) {
INFO("Paired unclustered indices not empty, using them");
- lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
+ paired_lib = MakeNewLib(lib, gp.g, gp.paired_indices[lib_index]);
} else if (gp.clustered_indices[lib_index].size() != 0 ) {
INFO("clustered indices not empty, using them");
- lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
+ paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
} else {
ERROR("All paired indices are empty!");
}
- shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, lib);
-//TODO::was copypasted from MakeScaffoldingExtender
-//TODO::REWRITE
- double var_coeff = 3.0;
- DEBUG("here creating extchooser");
-//TODO: 2 is relative weight cutoff, to config!
- auto scaff_chooser = std::make_shared<ExtensionChooser2015>(gp.g, counter, var_coeff, storage, 2, lib_index);
+ shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, paired_lib);
+//TODO:: was copy-pasted from MakeScaffoldingExtender; refactor the 2015 extension chooser
+ DEBUG("creating extchooser");
+
+ auto scaff_chooser = std::make_shared<ExtensionChooser2015>(gp.g,
+ counter,
+ lib_index,
+ storage,
+ pset.scaffolder_options.cl_threshold,
+ pset.scaffolder_options.var_coeff,
+ pset.scaffolding2015.relative_weight_cutoff);
auto gap_joiner = std::make_shared<HammingGapJoiner>(gp.g, pset.scaffolder_options.min_gap_score,
pset.scaffolder_options.short_overlap,
- (int) 2 * cfg::get().ds.RL());
-
- return make_shared<ScaffoldingPathExtender>(gp, cov_map, scaff_chooser, gap_joiner, lib->GetISMax(), pset.loop_removal.max_loops, false , false);
+ (int) pset.scaffolder_options.basic_overlap_coeff * dataset_info.RL());
+
+ return make_shared<ScaffoldingPathExtender>(gp, cov_map,
+ scaff_chooser,
+ gap_joiner,
+ paired_lib->GetISMax(),
+ pset.loop_removal.max_loops,
+ false, /* investigate short loops */
+ params.avoid_rc_connections,
+ false /* jump only from tips */);
}
-inline shared_ptr<SimpleExtender> MakeMPExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, const PathContainer& paths,
- size_t lib_index, const pe_config::ParamSetT& pset) {
+inline shared_ptr<SimpleExtender> MakeMPExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map,
+ const PathContainer& paths) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
- SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
+ const auto& lib = dataset_info.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.paired_indices[lib_index]);
+
+ SetSingleThresholdForLib(paired_lib, params.pset, lib.data().pi_threshold);
+ INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
size_t max_number_of_paths_to_search = GetNumberMPPaths(gp.g);
DEBUG("max number of mp paths " << max_number_of_paths_to_search);
- shared_ptr<MatePairExtensionChooser> chooser = make_shared<MatePairExtensionChooser>(gp.g, lib, paths, max_number_of_paths_to_search);
- return make_shared<SimpleExtender>(gp, cov_map, chooser, lib->GetISMax(), pset.loop_removal.mp_max_loops, true, false);
+ shared_ptr<MatePairExtensionChooser> chooser =
+ make_shared<MatePairExtensionChooser>(gp.g,
+ paired_lib,
+ paths,
+ max_number_of_paths_to_search,
+ params.uneven_depth);
+
+ return make_shared<SimpleExtender>(gp, cov_map,
+ chooser,
+ paired_lib->GetISMax(),
+ params.pset.loop_removal.mp_max_loops,
+ true, /* investigate short loops */
+ false /*use short loop coverage resolver*/);
}
-inline shared_ptr<SimpleExtender> MakeCoordCoverageExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- const pe_config::ParamSetT& pset) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, 0);
- CoverageAwareIdealInfoProvider provider(gp.g, lib, -1ul, 0);
+
+inline shared_ptr<SimpleExtender> MakeCoordCoverageExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map) {
+
+ const auto& lib = dataset_info.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
+
+ CoverageAwareIdealInfoProvider provider(gp.g, paired_lib, -1ul, 0);
auto coord_chooser = make_shared<CoordinatedCoverageExtensionChooser>(gp.g, provider,
- pset.coordinated_coverage.max_edge_length_in_repeat,
- pset.coordinated_coverage.delta,
- pset.coordinated_coverage.min_path_len);
- auto chooser = make_shared<JointExtensionChooser>(gp.g, MakeMetaExtensionChooser(gp, lib, pset), coord_chooser);
- return make_shared<SimpleExtender>(gp, cov_map, chooser, -1ul, pset.loop_removal.mp_max_loops, true, false);
+ params.pset.coordinated_coverage.max_edge_length_in_repeat,
+ params.pset.coordinated_coverage.delta,
+ params.pset.coordinated_coverage.min_path_len);
+ auto chooser = make_shared<JointExtensionChooser>(gp.g, MakeMetaExtensionChooser(paired_lib, params, gp, dataset_info.RL()), coord_chooser);
+
+ return make_shared<SimpleExtender>(gp, cov_map, chooser,
+ -1ul /* insert size */,
+ params.pset.loop_removal.mp_max_loops,
+ true, /* investigate short loops */
+ false /*use short loop coverage resolver*/);
}
-inline shared_ptr<SimpleExtender> MakeRNAExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
- shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, lib, pset.normalize_weight);
- shared_ptr<RNAExtensionChooser> extension = make_shared<RNAExtensionChooser>(gp.g, wc, GetWeightThreshold(lib, pset), GetPriorityCoeff(lib, pset));
- return make_shared<MultiExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
+inline shared_ptr<SimpleExtender> MakeRNAExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map,
+ bool investigate_loops) {
+
+ const auto& lib = dataset_info.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
+ SetSingleThresholdForLib(paired_lib, params.pset, lib.data().pi_threshold);
+ INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
+
+ shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, paired_lib, params.pset.normalize_weight);
+ shared_ptr<RNAExtensionChooser> extension =
+ make_shared<RNAExtensionChooser>(gp.g, wc,
+ GetWeightThreshold(paired_lib, params.pset),
+ GetPriorityCoeff(paired_lib, params.pset));
+
+ return make_shared<MultiExtender>(gp, cov_map,
+ extension,
+ paired_lib->GetISMax(),
+ params.pset.loop_removal.max_loops,
+ investigate_loops,
+ false /*use short loop coverage resolver*/);
}
-inline shared_ptr<SimpleExtender> MakeRNALongReadsExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, size_t lib_index,
- const pe_config::ParamSetT& pset) {
+
+inline shared_ptr<SimpleExtender> MakeRNALongReadsExtender(const config::dataset& dataset_info,
+ size_t lib_index,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map) {
+
VERIFY_MSG(false, "Long reads rna extender is not implemented yet")
- const auto& lib = cfg::get().ds.reads[lib_index];
+ const auto& lib = dataset_info.reads[lib_index];
size_t resolvable_repeat_length_bound = 10000ul;
if (!lib.is_contig_lib()) {
resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length);
}
INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound);
- auto long_reads_ec = MakeLongReadsExtensionChooser(gp, lib_index, pset.extension_options.max_repeat_length);
- return make_shared<SimpleExtender>(gp, cov_map, long_reads_ec, resolvable_repeat_length_bound,
- pset.loop_removal.max_loops, true, UseCoverageResolverForSingleReads(lib.type()));
-}
+ auto long_reads_ec = MakeLongReadsExtensionChooser(lib, lib_index, params, gp);
-inline bool InsertSizeCompare(const shared_ptr<PairedInfoLibrary> lib1,
- const shared_ptr<PairedInfoLibrary> lib2) {
- return lib1->GetISMax() < lib2->GetISMax();
+ return make_shared<SimpleExtender>(gp, cov_map,
+ long_reads_ec,
+ resolvable_repeat_length_bound,
+ params.pset.loop_removal.max_loops,
+ true, /* investigate short loops */
+ UseCoverageResolverForSingleReads(dataset_info, lib.type()));
}
+
template<typename Base, typename T>
inline bool instanceof(const T *ptr) {
return dynamic_cast<const Base*>(ptr) != nullptr;
}
+
//Used for debug purpose only
inline void PrintExtenders(vector<shared_ptr<PathExtender> >& extenders) {
DEBUG("Extenders in vector:");
@@ -525,8 +749,13 @@ inline void PrintExtenders(vector<shared_ptr<PathExtender> >& extenders) {
}
}
-inline vector<shared_ptr<PathExtender> > MakeAllExtenders(PathExtendStage stage, const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- const pe_config::ParamSetT& pset, const ScaffoldingUniqueEdgeStorage& storage, const PathContainer& paths_for_mp = PathContainer()) {
+inline vector<shared_ptr<PathExtender> > MakeAllExtenders(PathExtendStage stage,
+ const config::dataset& dataset_info,
+ const PathExtendParamsContainer& params,
+ const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map,
+ const ScaffoldingUniqueEdgeStorage& storage,
+ const PathContainer& paths_for_mp = PathContainer()) {
vector<shared_ptr<PathExtender> > result;
vector<shared_ptr<PathExtender> > pes;
@@ -540,64 +769,96 @@ inline vector<shared_ptr<PathExtender> > MakeAllExtenders(PathExtendStage stage,
size_t scf_pe_libs = 0;
size_t mp_libs = 0;
+ const auto& pset = params.pset;
+
for (io::LibraryType lt : io::LibraryPriotity) {
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- const auto& lib = cfg::get().ds.reads[i];
+ for (size_t lib_index = 0; lib_index < dataset_info.reads.lib_count(); ++lib_index) {
+ const auto& lib = dataset_info.reads[lib_index];
if (lib.type() != lt)
continue;
//TODO: scaff2015 does not need any single read libs?
if (IsForSingleReadExtender(lib) && pset.sm != sm_2015) {
- result.push_back(MakeLongReadsExtender(gp, cov_map, i, pset));
+ result.push_back(MakeLongReadsExtender(dataset_info, lib_index, params, gp, cov_map));
++single_read_libs;
}
if (IsForPEExtender(lib)) {
++pe_libs;
- if (IsPEStage(stage) && (pset.sm == sm_old_pe_2015 || pset.sm == sm_old || pset.sm == sm_combined)) {
- if (cfg::get().mode == config::pipeline_type::meta)
+ if (IsPEStage(stage) && IsOldPEEnabled(pset.sm)) {
+ if (params.mode == config::pipeline_type::meta)
//TODO proper configuration via config
- pes.push_back(MakeMetaExtender(gp, cov_map, i, pset, false));
- else if (cfg::get().mode == config::pipeline_type::moleculo)
- pes.push_back(MakeLongEdgePEExtender(gp, cov_map, i, pset, false));
- else if (cfg::get().mode == config::pipeline_type::rna && !IsPolishingStage(stage))
- pes.push_back(MakeRNAExtender(gp, cov_map, i, pset, false));
+ pes.push_back(MakeMetaExtender(dataset_info, lib_index, params, gp, cov_map, false));
+ else if (params.mode == config::pipeline_type::moleculo)
+ pes.push_back(MakeLongEdgePEExtender(dataset_info, lib_index, params, gp, cov_map, false));
+ else if (pset.multi_path_extend && !IsPolishingStage(stage))
+ pes.push_back(MakeRNAExtender(dataset_info, lib_index, params, gp, cov_map, false));
else
- pes.push_back(MakePEExtender(gp, cov_map, i, pset, false));
+ pes.push_back(MakePEExtender(dataset_info, lib_index, params, gp, cov_map, false));
}
else if (pset.sm == sm_2015) {
- pes2015.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
+ pes2015.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
}
}
//FIXME logic is very cryptic!
- if (IsForShortLoopExtender(lib) && (pset.sm == sm_old_pe_2015 || pset.sm == sm_old || pset.sm == sm_combined)) {
- if (cfg::get().mode == config::pipeline_type::meta)
- pes.push_back(MakeMetaExtender(gp, cov_map, i, pset, true));
- else if (cfg::get().mode == config::pipeline_type::rna && !IsPolishingStage(stage))
- pes.push_back(MakeRNAExtender(gp, cov_map, i, pset, true));
+ if (IsForShortLoopExtender(lib) && IsOldPEEnabled(pset.sm)) {
+ if (params.mode == config::pipeline_type::meta)
+ pes.push_back(MakeMetaExtender(dataset_info, lib_index, params, gp, cov_map, true));
+ else if (pset.multi_path_extend && !IsPolishingStage(stage))
+ pes.push_back(MakeRNAExtender(dataset_info, lib_index, params, gp, cov_map, true));
else
- pe_loops.push_back(MakePEExtender(gp, cov_map, i, pset, true));
+ pe_loops.push_back(MakePEExtender(dataset_info, lib_index, params, gp, cov_map, true));
}
- if (IsForScaffoldingExtender(lib) && cfg::get().use_scaffolder && pset.scaffolder_options.on) {
+ if (IsForScaffoldingExtender(lib) && params.use_scaffolder && pset.scaffolder_options.enabled) {
++scf_pe_libs;
- if (pset.sm == sm_old || pset.sm == sm_combined) {
- pe_scafs.push_back(MakeScaffoldingExtender(gp, cov_map, i, pset));
+ if (params.mode == config::pipeline_type::rna) {
+ pe_scafs.push_back(MakeRNAScaffoldingExtender(dataset_info, lib_index, params, gp, cov_map));
}
- if (pset.sm == sm_old_pe_2015 || pset.sm == sm_combined) {
- pe_scafs.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
+ else {
+ switch (pset.sm) {
+ case sm_old: {
+ pe_scafs.push_back(MakeScaffoldingExtender(dataset_info, lib_index, params, gp, cov_map));
+ break;
+ }
+ case sm_old_pe_2015: {
+ pe_scafs.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
+ break;
+ }
+ case sm_combined: {
+ pe_scafs.push_back(MakeScaffoldingExtender(dataset_info, lib_index, params, gp, cov_map));
+ pe_scafs.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
+ break;
+ }
+ default:
+ break;
+ }
}
}
if (IsForMPExtender(lib) && IsMPStage(stage)) {
++mp_libs;
- if (pset.sm == sm_old || pset.sm == sm_combined) {
- mps.push_back(MakeMPExtender(gp, cov_map, paths_for_mp, i, pset));
- }
- if (is_2015_scaffolder_enabled(pset.sm)) {
- mps.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
+ switch (pset.sm) {
+ case sm_old: {
+ mps.push_back(MakeMPExtender(dataset_info, lib_index, params, gp, cov_map, paths_for_mp));
+ break;
+ }
+ case sm_old_pe_2015: {
+ mps.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
+ break;
+ }
+ case sm_2015: {
+ mps.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
+ break;
+ }
+ case sm_combined: {
+ mps.push_back(MakeMPExtender(dataset_info, lib_index, params, gp, cov_map, paths_for_mp));
+ mps.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
+ break;
+ }
+ default:
+ break;
}
}
}
- //std::sort(scaff_libs.begin(), scaff_libs.end(), InsertSizeCompare);
result.insert(result.end(), pes.begin(), pes.end());
result.insert(result.end(), pes2015.begin(), pes2015.end());
result.insert(result.end(), pe_loops.begin(), pe_loops.end());
@@ -614,34 +875,35 @@ inline vector<shared_ptr<PathExtender> > MakeAllExtenders(PathExtendStage stage,
INFO("Using " << scf_pe_libs << " paired-end scaffolding " << LibStr(scf_pe_libs));
INFO("Using " << mp_libs << " mate-pair " << LibStr(mp_libs));
INFO("Using " << single_read_libs << " single read " << LibStr(single_read_libs));
- INFO("Scaffolder is " << (pset.scaffolder_options.on ? "on" : "off"));
+ INFO("Scaffolder is " << (pset.scaffolder_options.enabled ? "on" : "off"));
if (pset.use_coordinated_coverage) {
INFO("Using additional coordinated coverage extender");
- result.push_back(MakeCoordCoverageExtender(gp, cov_map, pset));
+ result.push_back(MakeCoordCoverageExtender(dataset_info, 0 /* lib index */, params, gp, cov_map));
}
PrintExtenders(result);
return result;
}
-inline shared_ptr<scaffold_graph::ScaffoldGraph> ConstructScaffoldGraph(const conj_graph_pack& gp,
- const ScaffoldingUniqueEdgeStorage& edge_storage,
- const pe_config::ParamSetT::ScaffoldGraphParamsT& params) {
+inline shared_ptr<scaffold_graph::ScaffoldGraph> ConstructScaffoldGraph(const config::dataset& dataset_info,
+ const pe_config::ParamSetT::ScaffoldGraphParamsT& params,
+ const conj_graph_pack& gp,
+ const ScaffoldingUniqueEdgeStorage& edge_storage) {
using namespace scaffold_graph;
vector<shared_ptr<ConnectionCondition>> conditions;
INFO("Constructing connections");
LengthEdgeCondition edge_condition(gp.g, edge_storage.GetMinLength());
- for (size_t lib_index = 0; lib_index < cfg::get().ds.reads.lib_count(); ++lib_index) {
- auto lib = cfg::get().ds.reads[lib_index];
+ for (size_t lib_index = 0; lib_index < dataset_info.reads.lib_count(); ++lib_index) {
+ const auto& lib = dataset_info.reads[lib_index];
if (lib.is_paired()) {
shared_ptr<PairedInfoLibrary> paired_lib;
if (IsForMPExtender(lib))
- paired_lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
+ paired_lib = MakeNewLib(lib, gp.g, gp.paired_indices[lib_index]);
else if (IsForPEExtender(lib))
- paired_lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
+ paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
else
INFO("Unusable paired lib #" << lib_index);
conditions.push_back(make_shared<AdvancedPairedConnectionCondition>(gp.g, paired_lib, lib_index,
@@ -671,7 +933,7 @@ inline shared_ptr<scaffold_graph::ScaffoldGraph> ConstructScaffoldGraph(const co
inline void PrintScaffoldGraph(shared_ptr<scaffold_graph::ScaffoldGraph> scaffoldGraph,
- const set<EdgeId> main_edge_set,
+ const set<EdgeId>& main_edge_set,
const string& filename) {
using namespace scaffold_graph;
@@ -701,9 +963,9 @@ inline void PrintScaffoldGraph(shared_ptr<scaffold_graph::ScaffoldGraph> scaffol
}
-inline size_t FindOverlapLenForStage(PathExtendStage stage) {
+inline size_t FindOverlapLenForStage(PathExtendStage stage, const config::dataset& dataset_info) {
size_t res = 0;
- for (const auto& lib : cfg::get().ds.reads) {
+ for (const auto& lib : dataset_info.reads) {
if (IsForPEExtender(lib) && IsPEStage(stage)) {
res = max(res, (size_t) lib.data().insert_size_right_quantile);
} else if (IsForShortLoopExtender(lib)) {
@@ -715,8 +977,8 @@ inline size_t FindOverlapLenForStage(PathExtendStage stage) {
return res;
}
-inline bool MPLibsExist() {
- for (const auto& lib : cfg::get().ds.reads)
+inline bool MPLibsExist(const config::dataset& dataset_info) {
+ for (const auto& lib : dataset_info.reads)
if (IsForMPExtender(lib))
return true;
@@ -744,23 +1006,25 @@ inline void CountMisassembliesWithReference(debruijn_graph::GenomeConsistenceChe
}
inline ScaffoldingUniqueEdgeStorage FillUniqueEdgeStorage(const conj_graph_pack& gp,
- size_t& min_unique_length,
- double& unique_variation) {
+ const config::dataset& dataset_info,
+ size_t& min_unique_length,
+ double& unique_variation,
+ bool autodetect) {
ScaffoldingUniqueEdgeStorage main_unique_storage;
//Setting scaffolding2015 parameters
- if (cfg::get().pe_params.param_set.scaffolding2015.autodetect) {
+ if (autodetect) {
INFO("Autodetecting unique edge set parameters...");
bool pe_found = false;
//TODO constants
size_t min_MP_IS = 10000;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ for (size_t i = 0; i < dataset_info.reads.lib_count(); ++i) {
- if (IsForPEExtender(cfg::get().ds.reads[i])) {
+ if (IsForPEExtender(dataset_info.reads[i])) {
pe_found = true;
}
- if (IsForMPExtender(cfg::get().ds.reads[i])) {
- min_MP_IS = min(min_MP_IS, (size_t) cfg::get().ds.reads[i].data().mean_insert_size);
+ if (IsForMPExtender(dataset_info.reads[i])) {
+ min_MP_IS = min(min_MP_IS, (size_t) dataset_info.reads[i].data().mean_insert_size);
}
}
if (pe_found) {
@@ -784,95 +1048,107 @@ inline ScaffoldingUniqueEdgeStorage FillUniqueEdgeStorage(const conj_graph_pack&
return main_unique_storage;
}
-inline void ResolveRepeatsPe(conj_graph_pack& gp,
- const std::string& output_dir,
- const std::string& contigs_name,
- bool traversLoops,
- boost::optional<std::string> broken_contigs) {
+
+inline void ResolveRepeatsPe(const config::dataset& dataset_info,
+ const PathExtendParamsContainer& params,
+ conj_graph_pack& gp) {
INFO("ExSPAnder repeat resolving tool started");
+ const pe_config::ParamSetT &pset = params.pset;
ScaffoldingUniqueEdgeStorage main_unique_storage;
- auto sc_mode = cfg::get().pe_params.param_set.sm;
- auto min_unique_length = cfg::get().pe_params.param_set.scaffolding2015.min_unique_length;
- auto unique_variaton = cfg::get().pe_params.param_set.scaffolding2015.unique_coverage_variation;
-
- if (is_2015_scaffolder_enabled(sc_mode)) {
- main_unique_storage = FillUniqueEdgeStorage(gp, min_unique_length, unique_variaton);
+ auto sc_mode = pset.sm;
+ auto min_unique_length = pset.scaffolding2015.min_unique_length;
+ auto unique_variaton = pset.scaffolding2015.unique_coverage_variation;
+ bool detect_repeats_online = !(IsScaffolder2015Enabled(sc_mode) || params.mode == config::pipeline_type::meta);
+
+ //Fill the storage to enable unique edge check
+ if (IsScaffolder2015Enabled(sc_mode)) {
+ main_unique_storage = FillUniqueEdgeStorage(gp, dataset_info,
+ min_unique_length,
+ unique_variaton,
+ pset.scaffolding2015.autodetect);
}
- make_dir(output_dir);
- make_dir(GetEtcDir(output_dir));
- const pe_config::ParamSetT &pset = cfg::get().pe_params.param_set;
+ make_dir(params.output_dir);
+ make_dir(params.etc_dir);
+
//Scaffold graph
shared_ptr<scaffold_graph::ScaffoldGraph> scaffoldGraph;
- if (cfg::get().pe_params.param_set.scaffold_graph_params.construct) {
- scaffoldGraph = ConstructScaffoldGraph(gp, main_unique_storage, cfg::get().pe_params.param_set.scaffold_graph_params);
- if (cfg::get().pe_params.param_set.scaffold_graph_params.output) {
- PrintScaffoldGraph(scaffoldGraph, main_unique_storage.GetSet(), GetEtcDir(output_dir) + "scaffold_graph");
+ if (pset.scaffold_graph_params.construct) {
+ scaffoldGraph = ConstructScaffoldGraph(dataset_info, params.pset.scaffold_graph_params, gp, main_unique_storage);
+ if (pset.scaffold_graph_params.output) {
+ PrintScaffoldGraph(scaffoldGraph, main_unique_storage.GetSet(), params.etc_dir + "scaffold_graph");
}
}
DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
- ContigWriter writer(gp.g, constructor, gp.components);
+ ContigWriter writer(gp.g, constructor, gp.components, params.mode == config::pipeline_type::plasmid);
//make pe + long reads extenders
GraphCoverageMap cover_map(gp.g);
INFO("SUBSTAGE = paired-end libraries")
PathExtendStage exspander_stage = PathExtendStage::PEStage;
- vector<shared_ptr<PathExtender> > all_libs = MakeAllExtenders(exspander_stage, gp, cover_map, pset,
- main_unique_storage);
+ vector<shared_ptr<PathExtender> > all_libs =
+ MakeAllExtenders(exspander_stage, dataset_info, params, gp, cover_map, main_unique_storage);
//Parameters are subject to change
- size_t max_is_right_quantile = max(FindOverlapLenForStage(exspander_stage), gp.g.k() + 100);
+ size_t max_is_right_quantile = max(FindOverlapLenForStage(exspander_stage, dataset_info), gp.g.k() + 100);
size_t min_edge_len = 100;
+ size_t max_edge_diff_pe = /*cfg::get().mode == config::pipeline_type::rna ? 0 :*/ max_is_right_quantile;
shared_ptr<CompositeExtender> mainPE = make_shared<CompositeExtender>(gp.g, cover_map, all_libs,
- max_is_right_quantile, main_unique_storage,
- cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+ main_unique_storage,
+ max_is_right_quantile,
+ pset.extension_options.max_repeat_length,
+ detect_repeats_online);
//extend pe + long reads
PathExtendResolver resolver(gp.g);
auto seeds = resolver.makeSimpleSeeds();
- DebugOutputPaths(gp, output_dir, seeds, "init_paths");
+ DebugOutputPaths(gp, params, seeds, "init_paths");
seeds.SortByLength();
INFO("Growing paths using paired-end and long single reads");
+ INFO("Multi path extend is " << (cfg::get().pe_params.param_set.multi_path_extend ? "on" : "off"))
+ INFO("Overlap removal is " << (cfg::get().pe_params.param_set.remove_overlaps ? "on" : "off"))
auto paths = resolver.extendSeeds(seeds, *mainPE);
paths.SortByLength();
- DebugOutputPaths(gp, output_dir, paths, "pe_before_overlap");
+ DebugOutputPaths(gp, params, paths, "pe_before_overlap");
PathContainer clone_paths;
GraphCoverageMap clone_map(gp.g);
- bool mp_exist = MPLibsExist();
+ bool mp_exist = MPLibsExist(dataset_info);
if (mp_exist) {
ClonePathContainer(paths, clone_paths, clone_map);
}
exspander_stage = PathExtendStage::PEPolishing;
- all_libs = MakeAllExtenders(exspander_stage, gp, cover_map, pset, main_unique_storage);
+ all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, cover_map, main_unique_storage);
mainPE = make_shared<CompositeExtender>(gp.g, cover_map, all_libs,
- max_is_right_quantile, main_unique_storage,
- cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+ main_unique_storage,
+ max_is_right_quantile,
+ pset.extension_options.max_repeat_length,
+ detect_repeats_online);
//We do not run overlap removal in 2015 mode
- if (!is_2015_scaffolder_enabled(sc_mode))
- FinalizePaths(paths, cover_map, min_edge_len, max_is_right_quantile);
- if (broken_contigs.is_initialized()) {
- OutputBrokenScaffolds(paths, (int) gp.g.k(), writer,
- output_dir + (mp_exist ? "pe_contigs" : broken_contigs.get()));
- }
- DebugOutputPaths(gp, output_dir, paths, "pe_before_traverse");
- if (traversLoops) {
+ if (!IsScaffolder2015Enabled(sc_mode))
+ FinalizePaths(params, paths, gp.g, cover_map, min_edge_len, max_edge_diff_pe);
+ if (params.output_broken_scaffolds) {
+ OutputBrokenScaffolds(paths, params, (int) gp.g.k(), writer,
+ params.output_dir + (mp_exist ? "pe_contigs" : params.broken_contigs));
+ }
+ DebugOutputPaths(gp, params, paths, "pe_before_traverse");
+ if (params.traverse_loops) {
TraverseLoops(paths, cover_map, mainPE);
- FinalizePaths(paths, cover_map, min_edge_len, max_is_right_quantile);
+ FinalizePaths(params, paths, gp.g, cover_map, min_edge_len, max_edge_diff_pe);
}
- DebugOutputPaths(gp, output_dir, paths, (mp_exist ? "pe_final_paths" : "final_paths"));
- writer.OutputPaths(paths, output_dir + (mp_exist ? "pe_scaffolds" : contigs_name));
+ DebugOutputPaths(gp, params, paths, (mp_exist ? "pe_final_paths" : "final_paths"));
+ writer.OutputPaths(paths, params.output_dir + (mp_exist ? "pe_scaffolds" : params.contigs_name));
cover_map.Clear();
seeds.DeleteAllPaths();
@@ -882,42 +1158,44 @@ inline void ResolveRepeatsPe(conj_graph_pack& gp,
}
//MP
- DebugOutputPaths(gp, output_dir, clone_paths, "mp_before_extend");
+ DebugOutputPaths(gp, params, clone_paths, "mp_before_extend");
INFO("SUBSTAGE = mate-pair libraries ")
exspander_stage = PathExtendStage::MPStage;
all_libs.clear();
- max_is_right_quantile = FindOverlapLenForStage(exspander_stage);
+ max_is_right_quantile = FindOverlapLenForStage(exspander_stage, dataset_info);
PathContainer mp_paths(clone_paths);
- if (is_2015_scaffolder_enabled(sc_mode)) {
+ if (IsScaffolder2015Enabled(sc_mode)) {
//TODO: constants
for (auto cur_length = min_unique_length; cur_length > 500; cur_length -= 500) {
ScaffoldingUniqueEdgeStorage current_unique_storage;
ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer(gp, cur_length, unique_variaton);
unique_edge_analyzer.FillUniqueEdgeStorage(current_unique_storage);
- all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, current_unique_storage, clone_paths);
+ all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, clone_map, current_unique_storage, clone_paths);
shared_ptr<CompositeExtender> mp_main_pe = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- max_is_right_quantile,
main_unique_storage,
- cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+ max_is_right_quantile,
+ pset.extension_options.max_repeat_length,
+ detect_repeats_online);
INFO("Growing paths using mate-pairs unique length " << cur_length);
mp_paths = resolver.extendSeeds(mp_paths, *mp_main_pe);
- DebugOutputPaths(gp, output_dir, mp_paths, "mp_before_overlap_" + std::to_string(cur_length));
+ DebugOutputPaths(gp, params, mp_paths, "mp_before_overlap_" + std::to_string(cur_length));
}
} else {
- all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, main_unique_storage, clone_paths);
+ all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, clone_map, main_unique_storage, clone_paths);
shared_ptr<CompositeExtender> mp_main_pe = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- max_is_right_quantile,
main_unique_storage,
- cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+ max_is_right_quantile,
+ pset.extension_options.max_repeat_length,
+ detect_repeats_online);
INFO("Growing paths using mate-pairs");
mp_paths = resolver.extendSeeds(clone_paths, *mp_main_pe);
- DebugOutputPaths(gp, output_dir, mp_paths, "mp_before_overlap");
- FinalizePaths(mp_paths, clone_map, max_is_right_quantile, max_is_right_quantile, true);
+ DebugOutputPaths(gp, params, mp_paths, "mp_before_overlap");
+ FinalizePaths(params, mp_paths, gp.g, clone_map, max_is_right_quantile, max_is_right_quantile, true);
}
- DebugOutputPaths(gp, output_dir, mp_paths, "mp_final_paths");
+ DebugOutputPaths(gp, params, mp_paths, "mp_final_paths");
DEBUG("Paths are grown with mate-pairs");
//MP end
@@ -926,35 +1204,39 @@ inline void ResolveRepeatsPe(conj_graph_pack& gp,
INFO("SUBSTAGE = polishing paths")
exspander_stage = PathExtendStage::FinalizingPEStage;
all_libs.clear();
- all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, main_unique_storage);
- max_is_right_quantile = FindOverlapLenForStage(exspander_stage);
+ all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, clone_map, main_unique_storage);
+ max_is_right_quantile = FindOverlapLenForStage(exspander_stage, dataset_info);
shared_ptr<CompositeExtender> last_extender = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- max_is_right_quantile, main_unique_storage,
- cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+ main_unique_storage,
+ max_is_right_quantile,
+ pset.extension_options.max_repeat_length,
+ detect_repeats_online);
auto last_paths = resolver.extendSeeds(mp_paths, *last_extender);
- DebugOutputPaths(gp, output_dir, last_paths, "mp2_before_overlap");
+ DebugOutputPaths(gp, params, last_paths, "mp2_before_overlap");
exspander_stage = PathExtendStage::FinalPolishing;
- all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, main_unique_storage);
+ all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, clone_map, main_unique_storage);
last_extender = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- max_is_right_quantile, main_unique_storage,
- cfg::get().pe_params.param_set.extension_options.max_repeat_length);
- if (!is_2015_scaffolder_enabled(sc_mode)) {
- FinalizePaths(last_paths, clone_map, min_edge_len, max_is_right_quantile);
- DebugOutputPaths(gp, output_dir, last_paths, "mp2_before_traverse");
+ main_unique_storage,
+ max_is_right_quantile,
+ pset.extension_options.max_repeat_length,
+ detect_repeats_online);
+ if (!IsScaffolder2015Enabled(sc_mode)) {
+ FinalizePaths(params, last_paths, gp.g, clone_map, min_edge_len, max_is_right_quantile);
+ DebugOutputPaths(gp, params, last_paths, "mp2_before_traverse");
}
TraverseLoops(last_paths, clone_map, last_extender);
- FinalizePaths(last_paths, clone_map, min_edge_len, max_is_right_quantile);
+ FinalizePaths(params, last_paths, gp.g, clone_map, min_edge_len, max_is_right_quantile);
//result
- if (broken_contigs.is_initialized()) {
- OutputBrokenScaffolds(last_paths, (int) gp.g.k(), writer, output_dir + broken_contigs.get());
+ if (params.output_broken_scaffolds) {
+ OutputBrokenScaffolds(last_paths, params, (int) gp.g.k(), writer, params.output_dir + params.broken_contigs);
}
debruijn_graph::GenomeConsistenceChecker genome_checker (gp, main_unique_storage, 1000, 0.2);
- DebugOutputPaths(gp, output_dir, last_paths, "mp2_final_paths");
- writer.OutputPaths(last_paths, output_dir + contigs_name);
+ DebugOutputPaths(gp, params, last_paths, "mp2_final_paths");
+ writer.OutputPaths(last_paths, params.output_dir + params.contigs_name);
if (gp.genome.size() > 0)
CountMisassembliesWithReference(genome_checker, last_paths);
//FinalizeUniquenessPaths();
diff --git a/src/modules/algorithms/path_extend/path_extender.hpp b/src/modules/algorithms/path_extend/path_extender.hpp
index 628a3ab..0c8bda5 100644
--- a/src/modules/algorithms/path_extend/path_extender.hpp
+++ b/src/modules/algorithms/path_extend/path_extender.hpp
@@ -459,7 +459,8 @@ public:
GapJoiner(g), min_la_length_(min_la_length), flank_addition_coefficient_(
flank_addition_coefficient), flank_multiplication_coefficient_(
flank_multiplication_coefficient) {
- DEBUG("flank_multiplication_coefficient - " << flank_multiplication_coefficient_); DEBUG("flank_addition_coefficient_ - " << flank_addition_coefficient_ );
+ DEBUG("flank_multiplication_coefficient - " << flank_multiplication_coefficient_);
+ DEBUG("flank_addition_coefficient_ - " << flank_addition_coefficient_ );
}
Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
@@ -488,7 +489,7 @@ public:
return Gap(INVALID_GAP);
}
- if (overlap_info.identity() < IDENTITY_RATIO) {
+ if (math::ls(overlap_info.identity(), IDENTITY_RATIO)) {
DEBUG("Low identity score");
return Gap(INVALID_GAP);
}
@@ -775,20 +776,31 @@ struct UsedUniqueStorage {
set<EdgeId> used_;
const ScaffoldingUniqueEdgeStorage& unique_;
+
+ UsedUniqueStorage(const ScaffoldingUniqueEdgeStorage& unique ):used_(), unique_(unique) {}
+
void insert(EdgeId e) {
if (unique_.IsUnique(e)) {
used_.insert(e);
used_.insert(e->conjugate());
}
}
- bool IsUsedAndUnique (EdgeId e) {
+
+ bool IsUsedAndUnique(EdgeId e) const {
return (unique_.IsUnique(e) && used_.find(e) != used_.end());
}
- UsedUniqueStorage(const ScaffoldingUniqueEdgeStorage& unique ):used_(), unique_(unique) {}
+
+ bool UniqueCheckEnabled() const {
+ return unique_.size() > 0;
+ }
+
+
};
+
class PathExtender {
public:
- PathExtender(const Graph & g): g_(g){ }
+ PathExtender(const Graph & g):
+ g_(g){ }
virtual ~PathExtender() { }
@@ -805,22 +817,34 @@ protected:
class CompositeExtender : public ContigsMaker {
public:
- CompositeExtender(Graph & g, GraphCoverageMap& cov_map, size_t max_diff_len, size_t max_repeat_length)
+ CompositeExtender(Graph & g, GraphCoverageMap& cov_map,
+ size_t max_diff_len,
+ size_t max_repeat_length,
+ bool detect_repeats_online)
: ContigsMaker(g),
cover_map_(cov_map),
repeat_detector_(g, cover_map_, 2 * max_repeat_length),
extenders_(),
- max_diff_len_(max_diff_len) {
+ max_diff_len_(max_diff_len),
+ max_repeat_len_(max_repeat_length),
+ detect_repeats_online_(detect_repeats_online) {
}
- CompositeExtender(Graph & g, GraphCoverageMap& cov_map, vector<shared_ptr<PathExtender> > pes, size_t max_diff_len, const ScaffoldingUniqueEdgeStorage& unique, size_t max_repeat_length)
+ CompositeExtender(Graph & g, GraphCoverageMap& cov_map,
+ vector<shared_ptr<PathExtender> > pes,
+ const ScaffoldingUniqueEdgeStorage& unique,
+ size_t max_diff_len,
+ size_t max_repeat_length,
+ bool detect_repeats_online)
: ContigsMaker(g),
cover_map_(cov_map),
repeat_detector_(g, cover_map_, 2 * max_repeat_length),
extenders_(),
- max_diff_len_(max_diff_len) {
+ max_diff_len_(max_diff_len),
+ max_repeat_len_(max_repeat_length),
+ detect_repeats_online_(detect_repeats_online) {
extenders_ = pes;
- used_storage_ = make_shared<UsedUniqueStorage>(UsedUniqueStorage( unique));
+ used_storage_ = make_shared<UsedUniqueStorage>(UsedUniqueStorage(unique));
for (auto ex: extenders_) {
ex->AddUniqueEdgeStorage(used_storage_);
}
@@ -846,15 +870,9 @@ public:
while (MakeGrowStep(path, paths_storage, false)) { }
}
- bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage, bool detect_repeats_online = true) {
+ bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage, bool detect_repeats_online_local = true) {
DEBUG("make grow step composite extender");
- auto sc_mode = cfg::get().pe_params.param_set.sm;
- if (is_2015_scaffolder_enabled(sc_mode) || cfg::get().mode == config::pipeline_type::meta) {
- DEBUG("force switch off online repeats detect, 2015 on");
- //FIXME disable for all!
- detect_repeats_online = false;
- }
- if (detect_repeats_online) {
+ if (detect_repeats_online_ && detect_repeats_online_local) {
BidirectionalPath *repeat_path = repeat_detector_.RepeatPath(path);
size_t repeat_size = repeat_detector_.MaxCommonSize(path, *repeat_path);
@@ -901,7 +919,7 @@ public:
size_t current = 0;
while (current < extenders_.size()) {
- DEBUG("step " << current << " from " <<extenders_.size());
+ DEBUG("step " << current << " of total " << extenders_.size());
if (extenders_[current]->MakeGrowStep(path, paths_storage)) {
return true;
}
@@ -915,6 +933,8 @@ private:
RepeatDetector repeat_detector_;
vector<shared_ptr<PathExtender> > extenders_;
size_t max_diff_len_;
+ size_t max_repeat_len_;
+ bool detect_repeats_online_;
shared_ptr<UsedUniqueStorage> used_storage_;
void SubscribeCoverageMap(BidirectionalPath * path) {
@@ -932,8 +952,7 @@ private:
INFO("Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
}
//In 2015 modes do not use a seed already used in paths.
- auto sc_mode = cfg::get().pe_params.param_set.sm;
- if (sc_mode == sm_old_pe_2015 || sc_mode == sm_2015 || sc_mode == sm_combined) {
+ if (used_storage_->UniqueCheckEnabled()) {
bool was_used = false;
for (size_t ind =0; ind < paths.Get(i)->Size(); ind++) {
EdgeId eid = paths.Get(i)->At(ind);
@@ -963,7 +982,7 @@ private:
GrowPath(*path, &result);
GrowPath(*conjugatePath, &result);
} while (count_trying < 10 && (path->Length() != current_path_len));
- path->CheckConjugateEnd(cfg::get().max_repeat_length);
+ path->CheckConjugateEnd(max_repeat_len_);
DEBUG("result path " << path->GetId());
path->Print();
}
@@ -978,7 +997,7 @@ class LoopDetectingPathExtender : public PathExtender {
protected:
size_t maxLoops_;
- bool investigateShortLoops_;
+ bool investigate_short_loops_;
bool use_short_loop_cov_resolver_;
CovShortLoopResolver cov_loop_resolver_;
@@ -987,11 +1006,12 @@ protected:
const GraphCoverageMap& cov_map_;
public:
- LoopDetectingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, size_t max_loops, bool investigateShortLoops,
+ LoopDetectingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, size_t max_loops,
+ bool investigate_short_loops,
bool use_short_loop_cov_resolver, size_t is)
: PathExtender(gp.g),
maxLoops_(max_loops),
- investigateShortLoops_(investigateShortLoops),
+ investigate_short_loops_(investigate_short_loops),
use_short_loop_cov_resolver_(use_short_loop_cov_resolver),
cov_loop_resolver_(gp),
is_detector_(gp.g, cov_map, is),
@@ -1004,11 +1024,11 @@ public:
}
bool isInvestigateShortLoops() const {
- return investigateShortLoops_;
+ return investigate_short_loops_;
}
void setInvestigateShortLoops(bool investigateShortLoops) {
- this->investigateShortLoops_ = investigateShortLoops;
+ this->investigate_short_loops_ = investigateShortLoops;
}
void setMaxLoops(size_t maxLoops) {
@@ -1144,7 +1164,7 @@ private:
}
bool InvestigateShortLoop() {
- return investigateShortLoops_ && (use_short_loop_cov_resolver_ || CanInvestigateShortLoop());
+ return investigate_short_loops_ && (use_short_loop_cov_resolver_ || CanInvestigateShortLoop());
}
protected:
DECL_LOGGER("LoopDetectingPathExtender")
@@ -1240,7 +1260,7 @@ protected:
DEBUG(candidates.size())
if (candidates.size() == 1) {
LoopDetector loop_detector(&path, cov_map_);
- if (!investigateShortLoops_ && (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
+ if (!investigate_short_loops_ && (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
&& extensionChooser_->WeightCounterBased()) {
return false;
}
@@ -1258,17 +1278,16 @@ protected:
LoopDetector loop_detector(&path, cov_map_);
DEBUG("loop detecor");
- if (!investigateShortLoops_ &&
+ if (!investigate_short_loops_ &&
(loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
&& extensionChooser_->WeightCounterBased()) {
return false;
}
DEBUG("push");
- auto sc_mode = cfg::get().pe_params.param_set.sm;
EdgeId eid = candidates.back().e_;
//In 2015 modes when trying to use already used unique edge, it is not added and path growing stops.
//That allows us to avoid overlap removal hacks used earlier.
- if (is_2015_scaffolder_enabled(sc_mode)) {
+ if (used_storage_->UniqueCheckEnabled()) {
if (used_storage_->IsUsedAndUnique(eid)) {
return false;
} else {
@@ -1303,15 +1322,38 @@ public:
protected:
virtual bool AddCandidates(BidirectionalPath& path, PathContainer* paths_storage, ExtensionChooser::EdgeContainer& candidates) override {
bool res = false;
- if (candidates.size() >= 1 && (max_candidates_ == 0 || candidates.size() <= max_candidates_)) {
+
+ if (candidates.size() == 1) {
+ LoopDetector loop_detector(&path, cov_map_);
+ DEBUG("loop detecor");
+ if (!investigate_short_loops_ &&
+ (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
+ && extensionChooser_->WeightCounterBased()) {
+ return false;
+ }
+ DEBUG("push");
+ EdgeId eid = candidates.back().e_;
+ path.PushBack(eid, candidates.back().d_);
+ DEBUG("push done");
+ return true;
+ }
+ else if (candidates.size() == 2 && (max_candidates_ == 0 || candidates.size() <= max_candidates_)) {
+ //Check for bulge
+ auto v = g_.EdgeStart(candidates.front().e_);
+ auto u = g_.EdgeEnd(candidates.front().e_);
+ for (auto edge : candidates) {
+ if (v != g_.EdgeStart(edge.e_) || u != g_.EdgeEnd(edge.e_))
+ return false;
+ }
+
LoopDetector loop_detector(&path, cov_map_);
DEBUG("loop detector");
- if (!investigateShortLoops_ && loop_detector.EdgeInShortLoop(path.Back())
+ if (!investigate_short_loops_ && loop_detector.EdgeInShortLoop(path.Back())
&& extensionChooser_->WeightCounterBased()) {
return false;
}
//First candidate is adding to THIS path.
- else if (not (!investigateShortLoops_ && loop_detector.EdgeInShortLoop(candidates.front().e_)
+ else if (not (!investigate_short_loops_ && loop_detector.EdgeInShortLoop(candidates.front().e_)
&& extensionChooser_->WeightCounterBased())) {
DEBUG("push");
path.PushBack(candidates.front().e_, candidates.front().d_);
@@ -1323,7 +1365,7 @@ protected:
}
//Creating new paths for other than new candidate.
for (size_t i = 1; i < candidates.size(); ++i) {
- if (not (!investigateShortLoops_ && loop_detector.EdgeInShortLoop(candidates.front().e_)
+ if (not (!investigate_short_loops_ && loop_detector.EdgeInShortLoop(candidates.front().e_)
&& extensionChooser_->WeightCounterBased())) {
BidirectionalPath *p = new BidirectionalPath(path);
p->PushBack(candidates[i].e_, candidates[i].d_);
@@ -1343,9 +1385,11 @@ protected:
class ScaffoldingPathExtender: public LoopDetectingPathExtender {
+private:
std::shared_ptr<ExtensionChooser> extension_chooser_;
ExtensionChooser::EdgeContainer sources_;
std::shared_ptr<GapJoiner> gap_joiner_;
+ bool avoid_rc_connections_;
//When check_sink_ set to false we can scaffold not only tips
bool check_sink_;
@@ -1364,67 +1408,71 @@ class ScaffoldingPathExtender: public LoopDetectingPathExtender {
return g_.OutgoingEdgeCount(g_.EdgeEnd(e)) == 0;
}
-
-public:
-
- ScaffoldingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, std::shared_ptr<ExtensionChooser> extension_chooser,
- std::shared_ptr<GapJoiner> gap_joiner, size_t is, size_t max_loops, bool investigateShortLoops, bool check_sink = true):
- LoopDetectingPathExtender(gp, cov_map, max_loops, investigateShortLoops, false, is),
- extension_chooser_(extension_chooser),
- gap_joiner_(gap_joiner),check_sink_(check_sink)
- {
- InitSources();
+protected:
+ virtual bool GapSatisfies(int /*gap*/) const {
+ return true;
}
- bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* /*paths_storage*/) override {
- if (path.Size() < 1 || (check_sink_ && !IsSink(path.Back())) ) {
+ bool MakeSimpleGrowStepForChooser(BidirectionalPath& path, std::shared_ptr<ExtensionChooser> ec, bool must_overlap = false) {
+ if (path.Size() < 1 || (check_sink_ && !IsSink(path.Back()))) {
return false;
}
DEBUG("scaffolding:");
DEBUG("Simple grow step, growing path");
path.Print();
- ExtensionChooser::EdgeContainer candidates = extension_chooser_->Filter(path, sources_);
+ ExtensionChooser::EdgeContainer candidates = ec->Filter(path, sources_);
DEBUG("scaffolding candidates " << candidates.size() << " from sources " << sources_.size());
+ //DEBUG("Extension chooser threshold = " << ec->GetThreshold())
+ DEBUG("Candidate size = " << candidates.size())
if (candidates.size() == 1) {
- if (candidates[0].e_ == path.Back() || (cfg::get().avoid_rc_connections && candidates[0].e_ == g_.conjugate(path.Back()))) {
+ if (candidates[0].e_ == path.Back()
+ || (avoid_rc_connections_ && candidates[0].e_ == g_.conjugate(path.Back()))) {
return false;
}
BidirectionalPath temp_path(path);
temp_path.PushBack(candidates[0].e_);
- if(this->DetectCycleScaffolding(temp_path)) {
+ if (this->DetectCycleScaffolding(temp_path)) {
return false;
}
- auto sc_mode = cfg::get().pe_params.param_set.sm;
EdgeId eid = candidates.back().e_;
- if(cfg::get().pe_params.param_set.scaffolder_options.fix_gaps && check_sink_) {
+ if (check_sink_) {
Gap gap = gap_joiner_->FixGap(path.Back(), candidates.back().e_, candidates.back().d_);
+ DEBUG("Gap after fixing " << gap.gap_ << " (was " << candidates.back().d_ << ")");
if (gap.gap_ != GapJoiner::INVALID_GAP) {
DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() <<
- ", fixed gap length: " << gap.gap_ << ", trash length: " << gap.trash_previous_ << "-" <<
- gap.trash_current_);
+ ", fixed gap length: " << gap.gap_ << ", trash length: " << gap.trash_previous_ << "-" <<
+ gap.trash_current_);
- if (is_2015_scaffolder_enabled(sc_mode)) {
+ if (used_storage_->UniqueCheckEnabled()) {
if (used_storage_->IsUsedAndUnique(eid)) {
return false;
} else {
used_storage_->insert(eid);
}
}
+
+ if (must_overlap && GapSatisfies(gap.gap_)) {
+ DEBUG("Overlap is not large enogh")
+ return false;
+ }
+ DEBUG("Overlap is good, success")
path.PushBack(eid, gap);
return true;
}
else {
DEBUG("Looks like wrong scaffolding. PathId: " << path.GetId() << " path length: " <<
- path.Length() << ", fixed gap length: " << candidates.back().d_);
+ path.Length() << ", fixed gap length: " << candidates.back().d_ << ", fixed = " << gap.gap_);
return false;
}
}
else {
DEBUG("Gap joiners off");
- DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() << ", fixed gap length: " << candidates.back().d_ );
- if (is_2015_scaffolder_enabled(sc_mode)) {
+ DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length()
+ << ", fixed gap length: " << candidates.back().d_);
+
+ if (used_storage_->UniqueCheckEnabled()) {
if (used_storage_->IsUsedAndUnique(eid)) {
return false;
} else {
@@ -1439,6 +1487,30 @@ public:
return false;
}
+public:
+
+ ScaffoldingPathExtender(const conj_graph_pack& gp,
+ const GraphCoverageMap& cov_map,
+ std::shared_ptr<ExtensionChooser> extension_chooser,
+ std::shared_ptr<GapJoiner> gap_joiner,
+ size_t is,
+ size_t max_loops,
+ bool investigate_short_loops,
+ bool avoid_rc_connections,
+ bool check_sink = true):
+ LoopDetectingPathExtender(gp, cov_map, max_loops, investigate_short_loops, false, is),
+ extension_chooser_(extension_chooser),
+ gap_joiner_(gap_joiner),
+ avoid_rc_connections_(avoid_rc_connections),
+ check_sink_(check_sink)
+ {
+ InitSources();
+ }
+
+ bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* /*paths_storage*/) override {
+ return MakeSimpleGrowStepForChooser(path, extension_chooser_);
+ }
+
bool ResolveShortLoopByCov(BidirectionalPath&) override {
return false;
}
@@ -1451,8 +1523,39 @@ public:
return extension_chooser_;
}
-private:
+protected:
DECL_LOGGER("ScaffoldingPathExtender");
};
+
+class RNAScaffoldingPathExtender: public ScaffoldingPathExtender {
+ std::shared_ptr<ExtensionChooser> strict_extension_chooser_;
+
+ int min_overlap_;
+
+protected:
+ bool GapSatisfies(int gap) const override {
+ return gap > (int) g_.k() - min_overlap_;
+ }
+
+public:
+
+ RNAScaffoldingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, std::shared_ptr<ExtensionChooser> extension_chooser,
+ std::shared_ptr<ExtensionChooser> strict_extension_chooser,
+ std::shared_ptr<GapJoiner> gap_joiner,
+ size_t is,
+ size_t max_loops,
+ bool investigate_short_loops,
+ int min_overlap = 0):
+ ScaffoldingPathExtender(gp, cov_map, extension_chooser, gap_joiner, is, max_loops, investigate_short_loops, true),
+ strict_extension_chooser_(strict_extension_chooser), min_overlap_(min_overlap) {}
+
+
+ bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* /*paths_storage*/) override {
+ return MakeSimpleGrowStepForChooser(path, GetExtensionChooser(), true) ||
+ MakeSimpleGrowStepForChooser(path, strict_extension_chooser_);
+ }
+
+};
+
}
diff --git a/src/modules/algorithms/path_extend/path_filter.hpp b/src/modules/algorithms/path_extend/path_filter.hpp
index 35f78c2..fa19ce9 100644
--- a/src/modules/algorithms/path_extend/path_filter.hpp
+++ b/src/modules/algorithms/path_extend/path_filter.hpp
@@ -30,15 +30,11 @@ public:
virtual bool predicate(BidirectionalPath& path) = 0;
- virtual bool conjugateOperator(bool p, bool cp) {
- return p || cp;
- }
-
PathContainer filter(PathContainer& paths) {
PathContainer result;
for (size_t i = 0; i < paths.size(); ++i) {
- if (conjugateOperator(predicate(*paths.Get(i)), predicate(*paths.GetConjugate(i)))) {
+ if (predicate(*paths.Get(i)) || predicate(*paths.GetConjugate(i))) {
result.AddPair(paths.Get(i), paths.GetConjugate(i));
}
}
@@ -76,13 +72,9 @@ public:
virtual bool predicate(BidirectionalPath& path) = 0;
- virtual bool conjugateOperator(bool p, bool cp) {
- return p && cp;
- }
-
void filter(PathContainer& paths) {
for (PathContainer::Iterator iter = paths.begin(); iter != paths.end(); ) {
- if (!conjugateOperator(predicate(*iter.get()), predicate(*iter.getConjugate()))) {
+ if (predicate(*iter.get()) || predicate(*iter.getConjugate())) {
iter = paths.erase(iter);
}
else {
@@ -107,10 +99,10 @@ public:
virtual bool predicate(BidirectionalPath& path) {
for (size_t i = 0; i < path.Size(); ++i) {
if (math::ls(g.coverage(path[i]), minCoverage)) {
- return false;
+ return true;
}
}
- return true;
+ return false;
}
};
@@ -125,7 +117,39 @@ public:
}
virtual bool predicate(BidirectionalPath& path) {
- return path.Length() > minLength;
+ return path.Length() <= minLength;
+ }
+};
+
+
+class IsolatedPathFilter: public ErasingPathFilter {
+
+protected:
+ size_t min_length_;
+
+ double min_cov_;
+
+public:
+ IsolatedPathFilter(const Graph& g_, size_t min_length, double min_cov = 10000000.0):
+ ErasingPathFilter(g_),
+ min_length_(min_length),
+ min_cov_(min_cov) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) {
+ if (path.Empty())
+ return true;
+
+ if (path.Size() <= 2) {
+ auto v1 = g.EdgeStart(path.Front());
+ auto v2 = g.EdgeEnd(path.Back());
+
+ return g.IncomingEdgeCount(v1) == 0 &&
+ g.OutgoingEdgeCount(v2) == 0 &&
+ path.Length() < min_length_ &&
+ math::ls(path.Coverage(), min_cov_);
+ }
+ return false;
}
};
diff --git a/src/modules/algorithms/path_extend/pe_config_struct.cpp b/src/modules/algorithms/path_extend/pe_config_struct.cpp
index 5f1d5b5..1acab7c 100644
--- a/src/modules/algorithms/path_extend/pe_config_struct.cpp
+++ b/src/modules/algorithms/path_extend/pe_config_struct.cpp
@@ -76,26 +76,49 @@ void load(pe_config::ParamSetT::CoordinatedCoverageT& coord_cov,
void load(pe_config::ParamSetT::ScaffolderOptionsT& so,
boost::property_tree::ptree const& pt, bool complete)
{
- using config_common::load;
- load(so.on , pt, "on" , complete);
- load(so.cutoff , pt, "cutoff", complete);
- load(so.rel_cutoff , pt, "rel_cutoff", complete);
- load(so.sum_threshold , pt, "sum_threshold", complete);
-
- load(so.cluster_info , pt, "cluster_info", complete);
- load(so.cl_threshold , pt, "cl_threshold", complete);
-
- load(so.fix_gaps , pt, "fix_gaps", complete);
- load(so.use_la_gap_joiner , pt, "use_la_gap_joiner", complete);
- load(so.min_gap_score , pt, "min_gap_score", complete);
- load(so.max_must_overlap , pt, "max_must_overlap", complete);
- load(so.max_can_overlap , pt, "max_can_overlap", complete);
- load(so.short_overlap , pt, "short_overlap", complete);
- load(so.artificial_gap , pt, "artificial_gap", complete);
- load(so.use_old_score , pt, "use_old_score", complete);
- load(so.min_overlap_length, pt, "min_overlap_length", complete);
- load(so.flank_addition_coefficient, pt, "flank_addition_coefficient", complete);
- load(so.flank_multiplication_coefficient, pt, "flank_multiplication_coefficient", complete);
+ using config_common::load;
+ load(so.enabled, pt, "enabled" , complete);
+ load(so.cutoff , pt, "cutoff", complete);
+ load(so.hard_cutoff , pt, "hard_cutoff", complete);
+ load(so.rel_cutoff , pt, "rel_cutoff", complete);
+ load(so.sum_threshold , pt, "sum_threshold", complete);
+
+ load(so.cluster_info , pt, "cluster_info", complete);
+ load(so.cl_threshold , pt, "cl_threshold", complete);
+
+ load(so.use_la_gap_joiner , pt, "use_la_gap_joiner", complete);
+ load(so.min_gap_score , pt, "min_gap_score", complete);
+ load(so.max_must_overlap , pt, "max_must_overlap", complete);
+ load(so.max_can_overlap , pt, "max_can_overlap", complete);
+ load(so.short_overlap , pt, "short_overlap", complete);
+ load(so.artificial_gap , pt, "artificial_gap", complete);
+ load(so.use_old_score , pt, "use_old_score", complete);
+ load(so.min_overlap_length, pt, "min_overlap_length", complete);
+ load(so.flank_addition_coefficient, pt, "flank_addition_coefficient", complete);
+ load(so.flank_multiplication_coefficient, pt, "flank_multiplication_coefficient", complete);
+
+ load(so.var_coeff , pt, "var_coeff", complete);
+ load(so.basic_overlap_coeff, pt, "basic_overlap_coeff", complete);
+
+ if (pt.count("min_overlap_for_rna_scaffolding")) {
+ VERIFY_MSG(!so.min_overlap_for_rna_scaffolding, "Option can be loaded only once");
+ so.min_overlap_for_rna_scaffolding.reset(0);
+ load(*so.min_overlap_for_rna_scaffolding, pt, "min_overlap_for_rna_scaffolding");
+ }
+}
+
+
+void load(pe_config::ParamSetT::PathFiltrationT& pf,
+ boost::property_tree::ptree const& pt, bool complete)
+{
+ using config_common::load;
+ load(pf.enabled , pt, "enabled" , complete);
+ if (pf.enabled) {
+ load(pf.min_length , pt, "min_length" , complete);
+ load(pf.isolated_min_length , pt, "isolated_min_length" , complete);
+ load(pf.min_length_for_low_covered , pt, "min_length_for_low_covered" , complete);
+ load(pf.min_coverage , pt, "min_coverage" , complete);
+ }
}
void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete) {
@@ -114,6 +137,8 @@ void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool c
load(p.use_coordinated_coverage, pt, "use_coordinated_coverage", complete);
load(p.scaffolding2015, pt, "scaffolding2015", complete);
load(p.scaffold_graph_params, pt, "scaffold_graph", complete);
+ load(p.path_filtration, pt, "path_cleaning", complete);
+
}
@@ -133,6 +158,8 @@ void load(pe_config::ParamSetT::Scaffolding2015& p, boost::property_tree::ptree
load(p.autodetect, pt, "autodetect");
load(p.min_unique_length, pt, "min_unique_length");
load(p.unique_coverage_variation, pt, "unique_coverage_variation");
+ load(p.relative_weight_cutoff, pt, "relative_weight_cutoff");
+
}
void load(pe_config::AllLongReads& p, boost::property_tree::ptree const& pt,
diff --git a/src/modules/algorithms/path_extend/pe_config_struct.hpp b/src/modules/algorithms/path_extend/pe_config_struct.hpp
index 47578c7..620f7c8 100644
--- a/src/modules/algorithms/path_extend/pe_config_struct.hpp
+++ b/src/modules/algorithms/path_extend/pe_config_struct.hpp
@@ -40,209 +40,228 @@ enum scaffolding_mode {
sm_old_pe_2015
};
-inline bool is_2015_scaffolder_enabled(const scaffolding_mode mode) {
+inline bool IsScaffolder2015Enabled(const scaffolding_mode mode) {
return (mode == sm_old_pe_2015 || mode == sm_2015 || mode == sm_combined);
}
+inline bool IsOldPEEnabled(const scaffolding_mode mode) {
+ return (mode == sm_old_pe_2015 || mode == sm_old || mode == sm_combined);
+}
+
// struct for path extend subproject's configuration file
struct pe_config {
- typedef boost::bimap<std::string, output_broken_scaffolds> output_broken_scaffolds_id_mapping;
+ typedef boost::bimap<std::string, output_broken_scaffolds> output_broken_scaffolds_id_mapping;
- static const output_broken_scaffolds_id_mapping FillOBSInfo() {
- output_broken_scaffolds_id_mapping::value_type info[] = {
- output_broken_scaffolds_id_mapping::value_type("none", obs_none),
- output_broken_scaffolds_id_mapping::value_type("break_gaps", obs_break_gaps),
- output_broken_scaffolds_id_mapping::value_type("break_all", obs_break_all)
- };
+ static const output_broken_scaffolds_id_mapping FillOBSInfo() {
+ output_broken_scaffolds_id_mapping::value_type info[] = {
+ output_broken_scaffolds_id_mapping::value_type("none", obs_none),
+ output_broken_scaffolds_id_mapping::value_type("break_gaps", obs_break_gaps),
+ output_broken_scaffolds_id_mapping::value_type("break_all", obs_break_all)
+ };
- return output_broken_scaffolds_id_mapping(info, utils::array_end(info));
- }
+ return output_broken_scaffolds_id_mapping(info, utils::array_end(info));
+ }
- static const output_broken_scaffolds_id_mapping& output_broken_scaffolds_info() {
- static output_broken_scaffolds_id_mapping output_broken_scaffolds_info = FillOBSInfo();
- return output_broken_scaffolds_info;
- }
+ static const output_broken_scaffolds_id_mapping &output_broken_scaffolds_info() {
+ static output_broken_scaffolds_id_mapping output_broken_scaffolds_info = FillOBSInfo();
+ return output_broken_scaffolds_info;
+ }
- static const std::string& output_broken_scaffolds_name(output_broken_scaffolds obs) {
- auto it = output_broken_scaffolds_info().right.find(obs);
- VERIFY_MSG(it != output_broken_scaffolds_info().right.end(),
- "No name for output broken scaffolds mode id = " << obs);
+ static const std::string &output_broken_scaffolds_name(output_broken_scaffolds obs) {
+ auto it = output_broken_scaffolds_info().right.find(obs);
+ VERIFY_MSG(it != output_broken_scaffolds_info().right.end(),
+ "No name for output broken scaffolds mode id = " << obs);
- return it->second;
- }
+ return it->second;
+ }
- static output_broken_scaffolds output_broken_scaffolds_id(std::string name) {
- auto it = output_broken_scaffolds_info().left.find(name);
- VERIFY_MSG(it != output_broken_scaffolds_info().left.end(),
- "There is no output broken scaffolds mode with name = " << name);
+ static output_broken_scaffolds output_broken_scaffolds_id(std::string name) {
+ auto it = output_broken_scaffolds_info().left.find(name);
+ VERIFY_MSG(it != output_broken_scaffolds_info().left.end(),
+ "There is no output broken scaffolds mode with name = " << name);
- return it->second;
- }
+ return it->second;
+ }
- typedef boost::bimap<std::string, scaffolding_mode> scaffolding_mode_id_mapping;
+ typedef boost::bimap<std::string, scaffolding_mode> scaffolding_mode_id_mapping;
- static const scaffolding_mode_id_mapping FillSMInfo() {
- scaffolding_mode_id_mapping::value_type info[] = {
- scaffolding_mode_id_mapping::value_type("old", sm_old),
- scaffolding_mode_id_mapping::value_type("2015", sm_2015),
- scaffolding_mode_id_mapping::value_type("combined", sm_combined),
- scaffolding_mode_id_mapping::value_type("old_pe_2015", sm_old_pe_2015)
- };
+ static const scaffolding_mode_id_mapping FillSMInfo() {
+ scaffolding_mode_id_mapping::value_type info[] = {
+ scaffolding_mode_id_mapping::value_type("old", sm_old),
+ scaffolding_mode_id_mapping::value_type("2015", sm_2015),
+ scaffolding_mode_id_mapping::value_type("combined", sm_combined),
+ scaffolding_mode_id_mapping::value_type("old_pe_2015", sm_old_pe_2015)
+ };
- return scaffolding_mode_id_mapping(info, utils::array_end(info));
- }
+ return scaffolding_mode_id_mapping(info, utils::array_end(info));
+ }
- static const scaffolding_mode_id_mapping& scaffolding_mode_info() {
- static scaffolding_mode_id_mapping scaffolding_mode_info = FillSMInfo();
- return scaffolding_mode_info;
- }
+ static const scaffolding_mode_id_mapping &scaffolding_mode_info() {
+ static scaffolding_mode_id_mapping scaffolding_mode_info = FillSMInfo();
+ return scaffolding_mode_info;
+ }
- static const std::string& scaffolding_mode_name(scaffolding_mode sm) {
- auto it = scaffolding_mode_info().right.find(sm);
- VERIFY_MSG(it != scaffolding_mode_info().right.end(),
- "No name for scaffolding mode id = " << sm);
+ static const std::string &scaffolding_mode_name(scaffolding_mode sm) {
+ auto it = scaffolding_mode_info().right.find(sm);
+ VERIFY_MSG(it != scaffolding_mode_info().right.end(),
+ "No name for scaffolding mode id = " << sm);
- return it->second;
- }
+ return it->second;
+ }
- static scaffolding_mode scaffolding_mode_id(std::string name) {
- auto it = scaffolding_mode_info().left.find(name);
- VERIFY_MSG(it != scaffolding_mode_info().left.end(),
- "There is no scaffolding mode with name = " << name);
+ static scaffolding_mode scaffolding_mode_id(std::string name) {
+ auto it = scaffolding_mode_info().left.find(name);
+ VERIFY_MSG(it != scaffolding_mode_info().left.end(),
+ "There is no scaffolding mode with name = " << name);
- return it->second;
- }
+ return it->second;
+ }
- struct OutputParamsT {
- bool write_overlaped_paths;
- bool write_paths;
+ struct OutputParamsT {
+ bool write_overlaped_paths;
+ bool write_paths;
- void DisableAll() {
- write_overlaped_paths = false;
- write_paths = false;
- }
- };
+ void DisableAll() {
+ write_overlaped_paths = false;
+ write_paths = false;
+ }
+ };
+ struct VisualizeParamsT {
+ bool print_overlaped_paths;
+ bool print_paths;
- struct VisualizeParamsT {
- bool print_overlaped_paths;
- bool print_paths;
+ void DisableAll() {
+ print_overlaped_paths = false;
+ print_paths = false;
+ }
+ };
- void DisableAll() {
- print_overlaped_paths = false;
- print_paths = false;
- }
- };
-
- struct ParamSetT {
- scaffolding_mode sm;
-
- bool normalize_weight;
- size_t split_edge_length;
-
- bool multi_path_extend;
- bool remove_overlaps;
- bool cut_all_overlaps;
-
- struct ExtensionOptionsT {
- bool use_default_single_threshold;
- double single_threshold;
- double weight_threshold;
- double priority_coeff;
- size_t max_repeat_length;
- } extension_options;
-
- ExtensionOptionsT mate_pair_options;
-
-
- struct ScaffolderOptionsT {
- bool on;
- int cutoff;
- double rel_cutoff;
- double sum_threshold;
-
- bool cluster_info;
- double cl_threshold;
-
- bool fix_gaps;
- bool use_la_gap_joiner;
- double min_gap_score;
- double max_must_overlap;
- double max_can_overlap;
- int short_overlap;
- size_t artificial_gap;
-
- bool use_old_score;
-
- size_t min_overlap_length;
- double flank_addition_coefficient;
- double flank_multiplication_coefficient;
- } scaffolder_options;
-
-
- struct LoopRemovalT {
- size_t max_loops;
- size_t mp_max_loops;
- } loop_removal;
-
-
- bool use_coordinated_coverage;
-
- struct CoordinatedCoverageT {
- size_t max_edge_length_in_repeat;
- double delta;
- size_t min_path_len;
- } coordinated_coverage;
- struct Scaffolding2015 {
- bool autodetect;
- size_t min_unique_length;
- double unique_coverage_variation;
- } scaffolding2015;
- struct ScaffoldGraphParamsT {
- bool construct;
- bool output;
- size_t always_add;
- size_t never_add;
- double relative_threshold;
- bool graph_connectivity;
- size_t max_path_length;
- } scaffold_graph_params;
- };
-
- struct LongReads {
- double filtering;
- double weight_priority;
- double unique_edge_priority;
- size_t min_significant_overlap;
- };
-
- struct AllLongReads{
- LongReads single_reads;
- LongReads pacbio_reads;
- LongReads contigs;
- LongReads meta_contigs;
- };
-
-
- struct MainPEParamsT {
- output_broken_scaffolds obs;
-
- bool finalize_paths;
- bool debug_output;
- std::string etc_dir;
-
- OutputParamsT output;
- VisualizeParamsT viz;
- ParamSetT param_set;
- AllLongReads long_reads;
- }; // params;
+ struct ParamSetT {
+ scaffolding_mode sm;
+
+ bool normalize_weight;
+ size_t split_edge_length;
+
+ bool multi_path_extend;
+ bool remove_overlaps;
+ bool cut_all_overlaps;
+
+ struct ExtensionOptionsT {
+ bool use_default_single_threshold;
+ double single_threshold;
+ double weight_threshold;
+ double priority_coeff;
+ size_t max_repeat_length;
+ } extension_options;
+
+ ExtensionOptionsT mate_pair_options;
+
+
+ struct ScaffolderOptionsT {
+ bool enabled;
+ int cutoff;
+ int hard_cutoff;
+ double rel_cutoff;
+ double sum_threshold;
+
+ bool cluster_info;
+ double cl_threshold;
+
+ bool use_la_gap_joiner;
+ double min_gap_score;
+ double max_must_overlap;
+ double max_can_overlap;
+ int short_overlap;
+ size_t artificial_gap;
+
+ bool use_old_score;
+
+ double var_coeff;
+ double basic_overlap_coeff;
+
+ size_t min_overlap_length;
+ double flank_addition_coefficient;
+ double flank_multiplication_coefficient;
+
+ boost::optional<int> min_overlap_for_rna_scaffolding;
+ } scaffolder_options;
+
+
+ struct LoopRemovalT {
+ size_t max_loops;
+ size_t mp_max_loops;
+ } loop_removal;
+
+ struct PathFiltrationT {
+ bool enabled;
+ size_t min_length;
+ size_t isolated_min_length;
+ size_t min_length_for_low_covered;
+ double min_coverage;
+ } path_filtration;
+
+
+ bool use_coordinated_coverage;
+
+ struct CoordinatedCoverageT {
+ size_t max_edge_length_in_repeat;
+ double delta;
+ size_t min_path_len;
+ } coordinated_coverage;
+
+ struct Scaffolding2015 {
+ bool autodetect;
+ size_t min_unique_length;
+ double unique_coverage_variation;
+ double relative_weight_cutoff;
+ } scaffolding2015;
+
+ struct ScaffoldGraphParamsT {
+ bool construct;
+ bool output;
+ size_t always_add;
+ size_t never_add;
+ double relative_threshold;
+ bool graph_connectivity;
+ size_t max_path_length;
+ } scaffold_graph_params;
+ };
+
+ struct LongReads {
+ double filtering;
+ double weight_priority;
+ double unique_edge_priority;
+ size_t min_significant_overlap;
+ };
+
+ struct AllLongReads {
+ LongReads single_reads;
+ LongReads pacbio_reads;
+ LongReads contigs;
+ LongReads meta_contigs;
+ };
+
+
+ struct MainPEParamsT {
+ output_broken_scaffolds obs;
+
+ bool finalize_paths;
+ bool debug_output;
+ std::string etc_dir;
+
+ OutputParamsT output;
+ VisualizeParamsT viz;
+ ParamSetT param_set;
+ AllLongReads long_reads;
+ }; // params;
};
-void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete = true);
-void load(pe_config::MainPEParamsT& p, boost::property_tree::ptree const& pt, bool complete = true);
+void load(pe_config::ParamSetT &p, boost::property_tree::ptree const &pt, bool complete = true);
+void load(pe_config::MainPEParamsT &p, boost::property_tree::ptree const &pt, bool complete = true);
//void load(pe_config& pe_cfg, boost::property_tree::ptree const& pt, bool complete);
}
diff --git a/src/modules/algorithms/path_extend/pe_io.hpp b/src/modules/algorithms/path_extend/pe_io.hpp
index 4aa9ffd..a31623c 100644
--- a/src/modules/algorithms/path_extend/pe_io.hpp
+++ b/src/modules/algorithms/path_extend/pe_io.hpp
@@ -31,6 +31,8 @@ protected:
size_t k_;
map<EdgeId, ExtendedContigIdT> ids_;
const ConnectedComponentCounter &c_counter_;
+ bool plasmid_contig_naming_;
+
//TODO: add constructor
string ToString(const BidirectionalPath& path) const {
stringstream ss;
@@ -108,7 +110,14 @@ protected:
public:
- ContigWriter(const Graph& g, ContigConstructor<Graph> &constructor, const ConnectedComponentCounter &c_counter): g_(g), constructor_(constructor), k_(g.k()), ids_(), c_counter_(c_counter) {
+ ContigWriter(const Graph& g,
+ ContigConstructor<Graph> &constructor,
+ const ConnectedComponentCounter &c_counter,
+ bool plasmid_contig_naming = false):
+ g_(g), constructor_(constructor), k_(g.k()),
+ ids_(), c_counter_(c_counter),
+ plasmid_contig_naming_(plasmid_contig_naming)
+ {
MakeContigIdMap(g_, ids_, c_counter, "NODE");
}
@@ -219,7 +228,7 @@ public:
path->Print();
string contig_id;
string path_string = ToString(*path);
- if (cfg::get().pd) {
+ if (plasmid_contig_naming_) {
EdgeId e = path->At(0);
size_t component = c_counter_.GetComponent(e);
contig_id = io::MakeContigComponentId(i, path_string.length(), path->Coverage(), component);
diff --git a/src/modules/algorithms/path_extend/pe_resolver.hpp b/src/modules/algorithms/path_extend/pe_resolver.hpp
index 9729c70..bc36993 100644
--- a/src/modules/algorithms/path_extend/pe_resolver.hpp
+++ b/src/modules/algorithms/path_extend/pe_resolver.hpp
@@ -459,9 +459,12 @@ public:
remover.RemoveSimilarPaths(paths, max_overlap, max_overlap, true, false, false, false, false);
}
- void removeOverlaps(PathContainer& paths, GraphCoverageMap& coverage_map, size_t min_edge_len, size_t max_path_diff, bool add_overlaps_begin) {
+ void removeOverlaps(PathContainer& paths, GraphCoverageMap& coverage_map,
+ size_t min_edge_len, size_t max_path_diff,
+ bool add_overlaps_begin,
+ bool cut_preudo_self_conjugate) {
SimpleOverlapRemover remover(g_, coverage_map);
- if (cfg::get().mode == config::pipeline_type::moleculo)
+ if (cut_preudo_self_conjugate)
remover.CutPseudoSelfConjugatePaths(paths);
//writer.WritePathsToFASTA(paths, output_dir + "/before.fasta");
//DEBUG("Removing subpaths");
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp b/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp
index 5afa91b..f4ba49c 100644
--- a/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp
+++ b/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp
@@ -29,12 +29,22 @@ protected:
EdgeContainer FindNextUniqueEdge(const EdgeId from) const;
DECL_LOGGER("ExtensionChooser2015")
public:
- ExtensionChooser2015(const Graph& g, shared_ptr<WeightCounter> wc, double is_scatter_coeff,
- const ScaffoldingUniqueEdgeStorage& unique_edges ,double relative_threshold, size_t lib_index):
- ScaffoldingExtensionChooser(g, wc, is_scatter_coeff), unique_edges_(unique_edges), relative_weight_threshold_(relative_threshold), paired_connection_condition_(g,
- wc->get_libptr(), lib_index,
-//TODO: constants are subject to reconsider
- 0), graph_connection_condition_(g, 2*unique_edges_.GetMinLength(), unique_edges), absolute_weight_threshold_(2), graph_connection_bonus_(2) {
+ ExtensionChooser2015(const Graph& g,
+ shared_ptr<WeightCounter> wc,
+ size_t lib_index,
+ const ScaffoldingUniqueEdgeStorage& unique_edges,
+ double cl_weight_threshold,
+ double is_scatter_coeff,
+ double relative_threshold):
+ //TODO: constants are subject to reconsider
+ ScaffoldingExtensionChooser(g, wc, cl_weight_threshold, is_scatter_coeff),
+ unique_edges_(unique_edges),
+ relative_weight_threshold_(relative_threshold),
+ paired_connection_condition_(g, wc->get_libptr(), lib_index, 0),
+ graph_connection_condition_(g, 2 * unique_edges_.GetMinLength(), unique_edges),
+ //TODO to congif!
+ absolute_weight_threshold_(2),
+ graph_connection_bonus_(2) {
INFO("ExtensionChooser2015 created");
}
/* @param edges are really not used and left for compatibility
diff --git a/src/modules/algorithms/path_extend/utils/CMakeLists.txt b/src/modules/algorithms/path_extend/utils/CMakeLists.txt
deleted file mode 100644
index 39d1e6f..0000000
--- a/src/modules/algorithms/path_extend/utils/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(utils CXX)
-
-add_executable(utils
- paired_info_checker.cpp)
-
-target_link_libraries(utils ${COMMON_LIBRARIES} input)
\ No newline at end of file
diff --git a/src/modules/algorithms/path_extend/utils/find_aligns.py b/src/modules/algorithms/path_extend/utils/find_aligns.py
deleted file mode 100644
index dfa90eb..0000000
--- a/src/modules/algorithms/path_extend/utils/find_aligns.py
+++ /dev/null
@@ -1,67 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-import sys
-
-if len(sys.argv) != 5:
- print "<input nucmer file> <begin pos> <end pos> <output>"
- exit(1)
-
-CONTIG = "CONTIG"
-UNALIGNED_CONTIG = "This contig is unaligned."
-REAL_ALIGN = "Real Alignment"
-TOTAL_ALIGN = "One align captures most of this contig"
-MISASSEMBl = "Exte"
-nucmer = open(sys.argv[1], "r")
-out = open(sys.argv[4], "w")
-begin_pos = int(sys.argv[2])
-end_pos = int(sys.argv[3])
-infos = []
-result_info = []
-for line in nucmer:
- if CONTIG in line:
- if True:
- between_poses = False
- first_pos = 100000000000000000000000
- important_info = "-----------------\n"
- for info in infos:
- if CONTIG in info:
- important_info += info.strip() + "\n"
- if "_388_" in info:
- print important_info
- elif REAL_ALIGN in info or TOTAL_ALIGN in info:
- which_align = info.split(":")[0].strip()
- lst = info.split(":")[1].split("|")
- gen_align = lst[0].strip().split(" ")
- contig_align = lst[1].strip().split(" ")
- gen_align_begin = int(gen_align[0])
- gen_align_end = int(gen_align[1])
- contig_align_begin = int(contig_align[0])
- contig_align_end = int(contig_align[1])
- first_pos = min(first_pos, gen_align_begin)
-
- if ((gen_align_begin >= begin_pos) and (gen_align_begin <= end_pos) ) or (
- (gen_align_end >= begin_pos)and(gen_align_end <= end_pos)):
- between_poses = True
- important_info += which_align + ":" + str(gen_align_begin) + " " + str(gen_align_end) + " | " + str(
- contig_align_begin) + " " + str(contig_align_end) + "\n"
- elif MISASSEMBl in info:
- important_info += info.strip() + "\n"
- #out.write(info)
- if between_poses:
- result_info.append((first_pos, important_info))
- #out.write(important_info)
- infos = [line]
- else:
- infos.append(line)
-result_info.sort()
-for info in result_info:
- out.write(info[1])
-out.write("-----------\n")
-
-out.close()
-
diff --git a/src/modules/algorithms/path_extend/utils/find_single_threshold.py b/src/modules/algorithms/path_extend/utils/find_single_threshold.py
deleted file mode 100644
index d53013e..0000000
--- a/src/modules/algorithms/path_extend/utils/find_single_threshold.py
+++ /dev/null
@@ -1,101 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-import sys
-import numpy as np
-import matplotlib
-import matplotlib.mlab as mlab
-import matplotlib.pyplot as plt
-if (len(sys.argv) != 2):
- print ("<all_weights>")
- exit(1)
-
-fin = open(sys.argv[1]);
-Pattern_good = "good "
-lst_good = []
-Pattern_bad = "bad "
-lst_bad = []
-good_size = 1
-bad_size = 1
-for line in fin:
- if "good pi" in line:
- lst = line.split("good pi")[1].strip().split(" ")
- good_size = float(lst[0])
- bad_size = float(lst[-1])
- elif not "good pi" in line and (Pattern_good in line or Pattern_bad in line):
- Pattern = Pattern_good
- lst = lst_good
- if Pattern_bad in line:
- Pattern = Pattern_bad
- lst = lst_bad
- text = line.split(Pattern)[1].strip()
- params = text.split(" ")
- if (float(params[0]) >= 0 and float(params[-1]) >=0 and
- float(params[-1]) != float('nan')):
- lst.append(float(params[0]))
-
-def commulate( com, lst, begin, t):
- i = begin
- while i < len(lst) and lst[i] < t:
- com += 1
- i += 1
- return (i, com)
-matplotlib.rcParams.update({'font.size': 18})
-def plot_lines(ax, name, lst_g, lst_b, min_x, max_x, nbins):
- lst1 = sorted([x for x in lst_g if x > 0.0], key = lambda x:x)
- lst2 = sorted([x for x in lst_b if x > 0.0], key = lambda x:x)
- params = []
- i = min_x
- while i < max_x:
- params.append(i)
- i += float((max_x - min_x))/nbins
- good = []
- bad = []
- begin = 0
- com = 0
- begin2 = 0
- com2 = 0
- for param in params:
- (begin, com) = commulate( com, lst1, begin, param)
- if len(lst1) == 0:
- good.append(0)
- else:
- good.append(float(com) / float(len(lst1)))
- (begin2, com2) = commulate(com2, lst2, begin2, param)
- if len(lst2) == 0:
- bad.append(0)
- else:
- bad.append(1- float(com2)/ float(len(lst2)))
- ax.plot(params, good, params, bad)
- plt.xlabel(u"\u03A8", fontsize=24)
- plt.ylabel('FP/FN rate', fontsize = 24)
- # a.xaxis.label.set_fontsize(40)
- #a.ylabel.label.set_fontsize(40)
- #ax.set_title(name)
-"""max_range = 50
-count_bins = 200
-min_x_norm = 1.0
-h = np.histogram([x[1] for x in lst_good if x[1] > min_x_norm], bins = count_bins, range = (min_x_norm,max_range))
-widths = np.diff(h[1])
-good_l = len([x[1] for x in lst_good if x[1] > min_x_norm])
-plt.bar(h[1][:-1], h[0]/float(good_l), widths, alpha = 0.5, color = "red")
-hist, bins = np.histogram([x[1] for x in lst_bad if x[1] > min_x_norm], bins = count_bins, range = (min_x_norm,max_range))
-widths = np.diff(bins)
-bad_l = len([x[1] for x in lst_bad if x[1] > min_x_norm])
-plt.bar(bins[:-1], hist/float(bad_l), widths, alpha = 0.5, color = "blue")
-plt.show()
-"""
-f, axarr = plt.subplots(1, 1)
-plot_lines(axarr, "plot", lst_good, lst_bad, 0.00, 0.8, 5000)
-#plot_lines(axarr[0, 1], "w", 1, lst_good, lst_bad, 0, 10, 200)
-#plot_lines(axarr[0, 2], "(w / ideal_pi) / min(pi1, pi2)", 3, lst_good, lst_bad,
-#0.000001, 0.05, 500)
-#plot_lines(axarr[1, 0], "(w / ideal_pi) / min(cov1, cov2)", 5, lst_good, lst_bad, 0.0, 0.001, 500)
-#plot_lines(axarr[1, 1], "(w / ideal_pi) / min(pi_norm1, pi_norm2)", 2, lst_good,
-#lst_bad, 0.0, 0.05, 500)
-#plot_lines(axarr[1, 2], "test1 / min(pi_norm1_aver, pi_norm2_aver)", 6, lst_good, lst_bad, 0.0, 3, 500)
-plt.show()
diff --git a/src/modules/algorithms/path_extend/utils/paired_info_checker.cpp b/src/modules/algorithms/path_extend/utils/paired_info_checker.cpp
deleted file mode 100644
index f77046a..0000000
--- a/src/modules/algorithms/path_extend/utils/paired_info_checker.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * paired_info_checker.cpp
- *
- * Created on: Sep 26, 2011
- * Author: andrey
- */
-
-#include "../lc_common.hpp"
-#include "../lc_io.hpp"
-
-using namespace debruijn_graph;
-
-class PairedInfoChecker {
-private:
- Graph& g_;
-
-public:
- PairedInfoChecker(Graph& g) : g_(g) {
-
- }
-
- bool IsSymmetric(PairedInfoIndex<Graph>& index) {
- bool result = true;
- for (auto iter = index.begin(); iter != index.end(); ++iter) {
- auto pi = *iter;
- if (pi.size() == 0) {
- continue;
- }
- EdgeId e1 = pi.back().first;
- EdgeId e2 = pi.back().second;
-
- auto sym_pi = index.GetEdgePairInfo(e2, e1);
-
- for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
- for (auto i2 = sym_pi.begin(); i2 != sym_pi.end(); ++i2) {
- if (math::eq(i1->d, - i2->d) && !math::eq(i1->weight, i2->weight)) {
- INFO("No symmetric found ");
- result = false;
- }
- }
- }
-
- }
- return result;
- }
-
- bool IsConjugateSymmetric(PairedInfoIndex<Graph>& index) {
- bool result = true;
- for (auto iter = index.begin(); iter != index.end(); ++iter) {
- auto pi = *iter;
- if (pi.size() == 0) {
- continue;
- }
- EdgeId e1 = pi.back().first;
- EdgeId e2 = pi.back().second;
-
- auto conj_pi = index.GetEdgePairInfo(g_.conjugate(e1), g_.conjugate(e2));
-
- for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
- for (auto i2 = conj_pi.begin(); i2 != conj_pi.end(); ++i2) {
- double new_d = i1->d - g_.length(e1) + g_.length(e2);
- if (math::eq(i1->d, - new_d) && !math::eq(i1->weight, i2->weight)) {
- INFO("No conjugate found ");
- result = false;
- }
- }
- }
-
- }
- return result;
- }
-
- bool AreEqual(PairedInfoIndex<Graph>& index1, PairedInfoIndex<Graph>& index2) {
- bool result = true;
- for (auto iter = index1.begin(); iter != index1.end(); ++iter) {
- auto pi = *iter;
- if (pi.size() == 0) {
- continue;
- }
- EdgeId e1 = pi.back().first;
- EdgeId e2 = pi.back().second;
-
- auto pi2 = index2.GetEdgePairInfo(e1, e2);
-
- for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
- for (auto i2 = pi2.begin(); i2 != pi2.end(); ++i2) {
- if (math::eq(i1->d, i2->d) && !math::eq(i1->weight, i2->weight)) {
- INFO("Unequal weights");
- result = false;
- }
- }
- }
-
- }
- return result;
- }
-
- void AggregatePairedInfo(PairedInfoIndex<Graph>& clustered, PairedInfoIndex<Graph>& advanced,
- size_t insert_size, size_t read_length,
- PairedInfoIndex<Graph>* result) {
-
- PairedInfoWeightNormalizer<Graph> normalizer(g_, insert_size, read_length, K);
-
- for (auto iter = clustered.begin(); iter != clustered.end(); ++iter) {
- auto pi = *iter;
- if (pi.size() == 0) {
- continue;
- }
-
- EdgeId e1 = pi.back().first;
- EdgeId e2 = pi.back().second;
-
- auto pi2 = advanced.GetEdgePairInfo(e1, e2);
-
- for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
-
- auto norm_pi = normalizer.NormalizeWeight(*i1);
-
- for (auto i2 = pi2.begin(); i2 != pi2.end(); ++i2) {
- if (math::ge(i1->d, i2->d - lc_cfg::get().u.dev) && math::le(i1->d, i2->d + lc_cfg::get().u.dev) && math::gr(i2->weight, 0.0)) {
- norm_pi.weight *= lc_cfg::get().es.advanced_coeff;
- }
- }
-
- result->AddPairInfo(norm_pi, false);
- }
-
- }
-
- }
-
-};
-
-
-int main() {
- cfg::create_instance(cfg_filename);
- lc_cfg::create_instance(long_contigs::lc_cfg_filename);
-
- Graph g(K);
- EdgeIndex<K + 1, Graph> index(g);
- PairedInfoIndex<Graph> pairedIndex(g, 0);
- KmerMapper<K+1, Graph> mapper(g);
- Sequence sequence("");
-
- long_contigs::LoadFromFile(lc_cfg::get().ds.graph_file, &g, sequence, &mapper);
- PairedInfoChecker checker(g);
-
- DataScanner<Graph> dataScanner(g);
-
- switch (lc_cfg::get().u.mode) {
- case 1: {
- INFO("Checking " << lc_cfg::get().u.file1);
- dataScanner.loadPaired(lc_cfg::get().u.file1, pairedIndex);
- INFO("Symmetric: " << checker.IsSymmetric(pairedIndex));
- INFO("Conjugate symmetric: " << checker.IsConjugateSymmetric(pairedIndex));
- break;
- }
- case 2: {
- PairedInfoIndex<Graph> pairedIndex2(g, 0);
- dataScanner.loadPaired(lc_cfg::get().u.file1, pairedIndex);
- dataScanner.loadPaired(lc_cfg::get().u.file2, pairedIndex2);
-
- INFO("Checking " << lc_cfg::get().u.file1 << " and " << lc_cfg::get().u.file2);
- INFO("1 is subset of 2 " << checker.AreEqual(pairedIndex, pairedIndex2));
- INFO("2 is subset of 1 " << checker.AreEqual(pairedIndex2, pairedIndex));
- break;
- }
- case 3: {
- INFO("Aggregating paired info");
-
- PairedInfoIndex<Graph> cl(g, 0);
- PairedInfoIndex<Graph> ad(g, 0);
- PairedInfoIndex<Graph> res(g, 0);
-
- dataScanner.loadPaired(lc_cfg::get().u.clustered, cl);
- dataScanner.loadPaired(lc_cfg::get().u.advanced, ad);
-
- checker.AggregatePairedInfo(cl, ad,
- lc_cfg::get().u.insert_size, lc_cfg::get().u.read_size,
- &res);
-
- DataPrinter<Graph> dataPrinter(g);
- dataPrinter.savePaired( "./" + lc_cfg::get().paired_info_file_prefix + "IS" + ToString(lc_cfg::get().u.insert_size) + "_RS" + ToString(lc_cfg::get().u.read_size)
- + "_agregate_" + ToString(lc_cfg::get().es.advanced_coeff), res);
-
- INFO("Done");
- break;
-
- }
- default: {
- INFO("Unknown mode");
- }
- }
-
- return 0;
-}
-
diff --git a/src/modules/algorithms/path_extend/utils/run_all_parametrs.py b/src/modules/algorithms/path_extend/utils/run_all_parametrs.py
deleted file mode 100644
index 0fc7ade..0000000
--- a/src/modules/algorithms/path_extend/utils/run_all_parametrs.py
+++ /dev/null
@@ -1,47 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-import sys
-import os
-
-if len(sys.argv) <4:
- print ("<pe_param_file> <out_file> <result_file>")
- exit(1)
-
-out_file = sys.argv[2]
-result_file = sys.argv[3]
-
-lst = [0.0, 0.0001, 0.0008, 0.001, 0.002, 0.003, 0.004, 0.005, 0.008, 0.01, 0.015, 0.02, 0.04, 0.05, 0.1, 0.15, 0.2, 0.4, 0.5]
-lst1 = [0.0001, 0.0008, 0.001, 0.005, 0.01, 0.05, 0.08, 0.1, 0.2]
-lst2 = [ 0.05, 0.08, 0.1,0.15, 0.2, 0.3]
-lst1 = [0.06, 0.1, 0.14]
-lst1 = [0.001, 0.005]
-lst2 = [0.1, 0.13, 0.15, 0.2]
-lst1 = [-1.0]
-for a in lst:
- # for b in lst:
- pe_params = open(sys.argv[1])
- pe_params_new = open("temp.txt", "w")
- line_index = 1
- for line in pe_params:
- new_line = line.strip() + "\n"
- if line_index == 72:
- new_line = "single_threshold " + str(a) + "\n"
- #elif line_index == 87:
- # new_line = "single_threshold " + str(b) + "\n"
- pe_params_new.write(new_line)
- line_index +=1
- print "change lines"
- pe_params_new.close()
- pe_params.close()
- os.rename("temp.txt", sys.argv[1])
- print "rename file"
- os.system("./run > log_" + str(a) + ".txt")#"_" + str(b) + ".txt")
- print "run program"
- os.system("cp -r " + out_file + " " + result_file + "_" + str(a) + ".fasta")#"_"+str(b) + ".fasta")
- print "copy file"+ out_file + " " + result_file + "_" + str(a) +".fasta"# "_"+str(b) + ".fasta"
-
diff --git a/src/modules/algorithms/simplification/erroneous_connection_remover.hpp b/src/modules/algorithms/simplification/erroneous_connection_remover.hpp
index 937baaa..c755d19 100644
--- a/src/modules/algorithms/simplification/erroneous_connection_remover.hpp
+++ b/src/modules/algorithms/simplification/erroneous_connection_remover.hpp
@@ -30,6 +30,129 @@ NecessaryECCondition(const Graph& g, size_t max_length, double max_coverage) {
CoverageUpperBound<Graph>(g, max_coverage)));
}
+
+template<class Graph>
+class RelativeCoverageECCondition: public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ const double rcec_ratio_;
+
+ template<class ContainerType>
+ double SumCompetitorCoverage(EdgeId ec_edge, const ContainerType& edges) const {
+ const Graph &g = this->g();
+ double sum = 0;
+ for (EdgeId e : edges) {
+ //update if competitor edge is not a loop
+ if (e != ec_edge && g.EdgeStart(e) != g.EdgeEnd(e))
+ sum += g.coverage(e);
+ }
+ return sum;
+ }
+
+ double AvgLocalityCoverage(EdgeId ec_edge) const {
+ const Graph &g = this->g();
+ VertexId start = g.EdgeStart(ec_edge), end = g.EdgeEnd(ec_edge);
+ auto in_start = g.IncomingEdges(start);
+ auto out_start = g.OutgoingEdges(start);
+ auto in_end = g.IncomingEdges(end);
+ auto out_end = g.OutgoingEdges(end);
+ double total_edges = double(g.IncomingEdgeCount(start) + g.OutgoingEdgeCount(start) +
+ g.IncomingEdgeCount(end) + g.OutgoingEdgeCount(end) - 2);
+ return (SumCompetitorCoverage(ec_edge, in_start) +
+ SumCompetitorCoverage(ec_edge, out_start) +
+ SumCompetitorCoverage(ec_edge, in_end) +
+ SumCompetitorCoverage(ec_edge, out_end)) / total_edges;
+ }
+
+ template<class ContainerType>
+ double MaxCompetitorCoverage(EdgeId ec_edge, const ContainerType& edges) const {
+ const Graph &g = this->g();
+ double result = 0;
+ for (EdgeId e : edges) {
+ //update if competitor edge is not a loop
+ if (e != ec_edge && g.EdgeStart(e) != g.EdgeEnd(e))
+ result = std::max(result, g.coverage(e));
+ }
+ return result;
+ }
+
+ double MaxCompetitorCoverage(EdgeId ec_edge) const {
+ const Graph &g = this->g();
+ VertexId start = g.EdgeStart(ec_edge), end = g.EdgeEnd(ec_edge);
+ auto in_start = g.IncomingEdges(start);
+ auto out_start = g.OutgoingEdges(start);
+ auto in_end = g.IncomingEdges(end);
+ auto out_end = g.OutgoingEdges(end);
+ return std::max(
+ std::max(MaxCompetitorCoverage(ec_edge, in_start),
+ MaxCompetitorCoverage(ec_edge, out_start)),
+ std::max(MaxCompetitorCoverage(ec_edge, in_end),
+ MaxCompetitorCoverage(ec_edge, out_end)));
+ }
+
+public:
+
+ RelativeCoverageECCondition(const Graph& g, double rcec_ratio) :
+ base(g), rcec_ratio_(rcec_ratio) {
+ }
+
+ bool Check(EdgeId e) const override {
+ //+1 is a trick to deal with edges of 0 coverage from iterative run
+ double locality_coverage = AvgLocalityCoverage(e) + 1;
+ return math::le(this->g().coverage(e), rcec_ratio_ * locality_coverage);
+ }
+
+};
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId> AddRelativeCoverageECCondition(const Graph &g, double rcec_ratio,
+ pred::TypedPredicate<typename Graph::EdgeId> condition) {
+ return pred::And(RelativeCoverageECCondition<Graph>(g, rcec_ratio), condition);
+}
+
+template<class Graph>
+inline bool IsSimpleBulge(const Graph &g, typename Graph::EdgeId e){
+ size_t edge_count = g.GetEdgesBetween(g.EdgeStart(e), g.EdgeEnd(e)).size();
+
+ return edge_count == g.OutgoingEdgeCount(g.EdgeStart(e)) &&
+ edge_count == g.IncomingEdgeCount(g.EdgeEnd(e)) &&
+ edge_count >= 2;
+}
+
+template<class Graph>
+class NotBulgeECCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+public:
+
+ NotBulgeECCondition(const Graph &g)
+ : base(g) {
+
+ }
+
+ bool Check(EdgeId e) const {
+ if (HasAlternatives(this->g(), e) && !IsSimpleBulge(this->g(), e)){
+ DEBUG("edge id = " << this->g().int_id(e)
+ << " between = " << this->g().GetEdgesBetween(this->g().EdgeStart(e), this->g().EdgeEnd(e)).size()
+ << " between ids: " << this->g().GetEdgesBetween(this->g().EdgeStart(e), this->g().EdgeEnd(e))
+ << " outgoing s = " << this->g().OutgoingEdgeCount(this->g().EdgeStart(e))
+ << " incoming e = " << this->g().IncomingEdgeCount(this->g().EdgeEnd(e)));
+ }
+ return !IsSimpleBulge(this->g(), e);
+ }
+
+};
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId> AddNotBulgeECCondition(const Graph &g,
+ pred::TypedPredicate<typename Graph::EdgeId> condition) {
+ return pred::And(NotBulgeECCondition<Graph>(g), condition);
+}
+
template<class Graph>
bool RemoveErroneousEdgesInCoverageOrder(Graph &g,
pred::TypedPredicate<typename Graph::EdgeId> removal_condition,
diff --git a/src/modules/algorithms/simplification/tip_clipper.hpp b/src/modules/algorithms/simplification/tip_clipper.hpp
index 32951e7..a4b7db3 100644
--- a/src/modules/algorithms/simplification/tip_clipper.hpp
+++ b/src/modules/algorithms/simplification/tip_clipper.hpp
@@ -44,7 +44,7 @@ class RelativeCoverageTipCondition: public EdgeCondition<Graph> {
auto out = g.OutgoingEdges(start);
auto in = g.IncomingEdges(end);
return std::max(
- MaxCompetitorCoverage(tip, out.begin(), out.end()),
+ MaxCompetitorCoverage(tip, out.begin(), out.end()),
MaxCompetitorCoverage(tip, in.begin(), in.end()));
// return std::max(
// MaxCompetitorCoverage(tip, g.out_begin(start),
@@ -178,11 +178,11 @@ public:
counts[s_edge[position]] ++;
}
size_t curm = *std::max_element(counts.begin(), counts.end());
- if (curm > (end - start) * max_AT_percentage_) {
+ if (curm > max_AT_percentage_ * double(end - start)) {
DEBUG("deleting edge" << s_edge.str());;
DEBUG("curm: " << curm);
- DEBUG("start end cutoff" << start << " " << end << " " << this->g().length(e) * max_AT_percentage_);
+ DEBUG("start end cutoff" << start << " " << end << " " << max_AT_percentage_ * double(this->g().length(e)));
return true;
} else {
@@ -241,15 +241,17 @@ public:
+ this->g().IncomingEdgeCount(this->g().EdgeStart(e)) >= 1);
}
+ private:
+ DECL_LOGGER("DeadEndCondition");
+
};
template<class Graph>
pred::TypedPredicate<typename Graph::EdgeId>AddDeadEndCondition(const Graph& g,
pred::TypedPredicate<typename Graph::EdgeId> condition) {
- return pred::And<typename Graph::EdgeId>(DeadEndCondition<Graph>(g), condition);
+ return pred::And(DeadEndCondition<Graph>(g), condition);
}
-
//template<class Graph>
//bool ClipTips(
// Graph& g,
diff --git a/src/modules/assembly_graph/CMakeLists.txt b/src/modules/assembly_graph/CMakeLists.txt
index 5854450..41031ef 100644
--- a/src/modules/assembly_graph/CMakeLists.txt
+++ b/src/modules/assembly_graph/CMakeLists.txt
@@ -8,5 +8,5 @@
project(graph_support CXX)
add_library(graph_support STATIC
- components/connected_component.cpp paths/bidirectional_path.cpp graph_support/scaff_supplementary.cpp)
+ components/connected_component.cpp paths/bidirectional_path.cpp graph_support/scaff_supplementary.cpp graph_alignment/edge_index_refiller.cpp)
target_link_libraries(graph_support hattrie)
diff --git a/src/modules/assembly_graph/graph_alignment/edge_index.hpp b/src/modules/assembly_graph/graph_alignment/edge_index.hpp
index 72a9d25..187ea94 100644
--- a/src/modules/assembly_graph/graph_alignment/edge_index.hpp
+++ b/src/modules/assembly_graph/graph_alignment/edge_index.hpp
@@ -7,42 +7,36 @@
#pragma once
-#include "dev_support/openmp_wrapper.h"
-
#include "assembly_graph/graph_core/graph.hpp"
#include "assembly_graph/graph_core/action_handlers.hpp"
-#include "dev_support/standard_base.hpp"
-#include "data_structures/indices/edge_index_builders.hpp"
-
+#include "data_structures/indices/edge_position_index.hpp"
+#include "edge_index_refiller.hpp"
+
namespace debruijn_graph {
/**
* EdgeIndex is a structure to store info about location of certain k-mers in graph. It delegates all
* container procedures to inner_index_ and all handling procedures to
* renewer_ which is DataHashRenewer.
- * @see DeBruijnKMerIndex
- * @see DataHashRenewer
*/
-//fixme template params
-template<class Graph, class Seq /*= runtime_k::RtSeq*/,
- class Index /*= KmerFreeEdgeIndex<Graph, Seq>*/>
+template<class Graph>
class EdgeIndex: public omnigraph::GraphActionHandler<Graph> {
public:
typedef typename Graph::EdgeId EdgeId;
- typedef Index InnerIndexT;
+ using InnerIndex = KmerFreeEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, DefaultStoring>;
typedef Graph GraphT;
- typedef typename Index::KMer KMer;
- typedef typename Index::KMerIdx KMerIdx;
- typedef typename Index::Value Value;
+ typedef typename InnerIndex::KMer KMer;
+ typedef typename InnerIndex::KMerIdx KMerIdx;
+ typedef typename InnerIndex::Value Value;
private:
- Index inner_index_;
- EdgeInfoUpdater<Index, Graph> updater_;
+ InnerIndex inner_index_;
+ EdgeInfoUpdater<InnerIndex, Graph> updater_;
+ EdgeIndexRefiller refiller_;
bool delete_index_;
public:
-
EdgeIndex(const Graph& g, const std::string &workdir)
: omnigraph::GraphActionHandler<Graph>(g, "EdgeIndex"),
inner_index_(g, workdir),
@@ -54,7 +48,7 @@ public:
TRACE("~EdgeIndex OK")
}
- Index &inner_index() {
+ InnerIndex &inner_index() {
return inner_index_;
}
@@ -62,16 +56,16 @@ public:
return inner_index_.k();
}
- const Index &inner_index() const {
+ const InnerIndex &inner_index() const {
VERIFY(this->IsAttached());
return inner_index_;
}
- virtual void HandleAdd(EdgeId e) {
+ void HandleAdd(EdgeId e) override {
updater_.UpdateKmers(e);
}
- virtual void HandleDelete(EdgeId e) {
+ void HandleDelete(EdgeId e) override {
updater_.DeleteKmers(e);
}
@@ -93,10 +87,7 @@ public:
void Refill() {
clear();
- typedef typename EdgeIndexHelper<InnerIndexT>::GraphPositionFillingIndexBuilderT IndexBuilder;
- //also makes an update!
- //todo pass appropriate 3-rd arg
- IndexBuilder().BuildIndexFromGraph(inner_index_, this->g());
+ refiller_.Refill(inner_index_, this->g());
INFO("Index refilled");
}
diff --git a/src/modules/assembly_graph/graph_alignment/edge_index_refiller.cpp b/src/modules/assembly_graph/graph_alignment/edge_index_refiller.cpp
new file mode 100644
index 0000000..d008b5a
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/edge_index_refiller.cpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "data_structures/indices/edge_index_builders.hpp"
+#include "data_structures/indices/edge_multi_index.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+
+#include "edge_index_refiller.hpp"
+
+namespace debruijn_graph {
+
+using EdgeIndex = KmerFreeEdgeIndex<ConjugateDeBruijnGraph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>>;
+
+template<>
+void EdgeIndexRefiller::Refill(EdgeIndex &index,
+ const ConjugateDeBruijnGraph &g) {
+ typedef typename EdgeIndexHelper<EdgeIndex>::GraphPositionFillingIndexBuilderT IndexBuilder;
+ IndexBuilder().BuildIndexFromGraph(index, g);
+}
+
+using PacIndex = DeBruijnEdgeMultiIndex<ConjugateDeBruijnGraph::EdgeId>;
+
+template<>
+void EdgeIndexRefiller::Refill(PacIndex &index,
+ const ConjugateDeBruijnGraph &g) {
+ typedef typename debruijn_graph::EdgeIndexHelper<PacIndex>::GraphPositionFillingIndexBuilderT Builder;
+ Builder().BuildIndexFromGraph(index, g);
+}
+
+}
diff --git a/src/modules/assembly_graph/graph_alignment/edge_index_refiller.hpp b/src/modules/assembly_graph/graph_alignment/edge_index_refiller.hpp
new file mode 100644
index 0000000..dc3f551
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/edge_index_refiller.hpp
@@ -0,0 +1,20 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+namespace debruijn_graph {
+
+// Refill is a template so that this interface can be declared without
+// including any headers. Both EdgeIndex and Graph are very complex template
+// types, and we do not want to pull the corresponding headers in here until
+// the dependencies between them are untangled.
+struct EdgeIndexRefiller {
+ template<class EdgeIndex, class Graph>
+ void Refill(EdgeIndex &index, const Graph &g);
+};
+
+}
diff --git a/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp b/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp
index f905d2d..0f67d38 100644
--- a/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp
+++ b/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp
@@ -176,6 +176,11 @@ public:
return answer;
}
+ bool CanSubstitute(const Kmer &kmer) const {
+ const auto *rawval = mapping_.find(kmer);
+ return rawval != nullptr;
+ }
+
void BinWrite(std::ostream &file) const {
uint32_t sz = (uint32_t)size();
file.write((const char *) &sz, sizeof(uint32_t));
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp
index 155c560..0a1c55a 100644
--- a/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp
+++ b/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp
@@ -5,26 +5,23 @@
//* See file LICENSE for details.
//***************************************************************************
-/*
- * pac_index.hpp
- *
- * Created on: Jan 21, 2013
- * Author: lab42
- */
#pragma once
#include "data_structures/indices/edge_multi_index.hpp"
-#include "data_structures/indices/edge_index_builders.hpp"
-#include <algorithm>
+#include "assembly_graph/graph_alignment/edge_index_refiller.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+// FIXME: Layering violation, get rid of this
+#include "pipeline/config_struct.hpp"
#include "pacbio_read_structures.hpp"
+#include "pipeline/config_struct.hpp"
-namespace pacbio {
-#define UNDEF_COLOR -1
-#define DELETED_COLOR -2
-
-template<class Graph>
-struct MappingDescription {
+#include <algorithm>
+namespace pacbio {
+enum {
+ UNDEF_COLOR = -1,
+ DELETED_COLOR = - 2
};
template<class Graph>
@@ -35,7 +32,7 @@ public:
typedef set<KmerCluster<Graph> > ClustersSet;
typedef typename Graph::VertexId VertexId;
typedef typename Graph::EdgeId EdgeId;
- typedef debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> Index;
+ typedef debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> Index;
typedef typename Index::KeyWithHash KeyWithHash;
private:
@@ -47,42 +44,36 @@ private:
const static int short_edge_cutoff = 0;
const static size_t min_cluster_size = 8;
const static int max_similarity_distance = 500;
+
+//Debug stats
int good_follow = 0;
int half_bad_follow = 0;
int bad_follow = 0;
- double compression_cutoff;
- double domination_cutoff;
set<Sequence> banned_kmers;
debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> tmp_index;
map<pair<VertexId, VertexId>, vector<size_t> > distance_cashed;
size_t read_count;
bool ignore_map_to_middle;
-
+ debruijn_graph::config::debruijn_config::pacbio_processor pb_config_;
public:
MappingDescription Locate(const Sequence &s) const;
- PacBioMappingIndex(const Graph &g, size_t k, size_t debruijn_k_, bool ignore_map_to_middle)
+ PacBioMappingIndex(const Graph &g, size_t k, size_t debruijn_k_, bool ignore_map_to_middle, string out_dir, debruijn_graph::config::debruijn_config::pacbio_processor pb_config )
: g_(g),
pacbio_k(k),
debruijn_k(debruijn_k_),
- tmp_index((unsigned) pacbio_k, cfg::get().output_dir), ignore_map_to_middle(ignore_map_to_middle) {
+ tmp_index((unsigned) pacbio_k, out_dir), ignore_map_to_middle(ignore_map_to_middle), pb_config_(pb_config) {
DEBUG("PB Mapping Index construction started");
-
- typedef typename debruijn_graph::EdgeIndexHelper<debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId>>::GraphPositionFillingIndexBuilderT Builder;
-
- Builder().BuildIndexFromGraph(tmp_index, g_);
+ debruijn_graph::EdgeIndexRefiller().Refill(tmp_index, g_);
INFO("Index constructed");
FillBannedKmers();
- compression_cutoff = cfg::get().pb.compression_cutoff; // 0.6
- domination_cutoff = cfg::get().pb.domination_cutoff; //1.5
- //INFO(tmp_index.size());
read_count = 0;
}
~PacBioMappingIndex(){
DEBUG("good/ugly/bad counts:" << good_follow << " "<<half_bad_follow << " " << bad_follow);
-
}
+
void FillBannedKmers() {
for (int i = 0; i < 4; i++) {
auto base = nucl((unsigned char) i);
@@ -109,8 +100,8 @@ public:
} else if (b.read_position == a.read_position) {
return (abs(int(b.edge_position) + shift - int(a.edge_position)) < 2);
} else {
- return ((b.edge_position + shift - a.edge_position >= (b.read_position - a.read_position) * compression_cutoff) &&
- ((b.edge_position + shift - a.edge_position) * compression_cutoff <= (b.read_position - a.read_position)));
+ return ((b.edge_position + shift - a.edge_position >= (b.read_position - a.read_position) * pb_config_.compression_cutoff) &&
+ ((b.edge_position + shift - a.edge_position) * pb_config_.compression_cutoff <= (b.read_position - a.read_position)));
}
}
@@ -292,7 +283,7 @@ public:
const KmerCluster<Graph> &b) const {
size_t a_size = a.size;
size_t b_size = b.size;
- if ((double) a_size < (double) b_size * domination_cutoff
+ if ((double) a_size < (double) b_size * pb_config_.domination_cutoff
|| a.sorted_positions[a.first_trustable_index].read_position
> b.sorted_positions[b.first_trustable_index].read_position
|| a.sorted_positions[a.last_trustable_index].read_position
@@ -349,9 +340,9 @@ public:
vector<EdgeId> intermediate_path = BestScoredPath(s, start_v, end_v, limits.first, limits.second, seq_start, seq_end, s_add, e_add);
if (intermediate_path.size() == 0) {
DEBUG("Tangled region between edgees "<< g_.int_id(prev_edge) << " " << g_.int_id(cur_edge) << " is not closed, additions from edges: " << int(g_.length(prev_edge)) - int(prev_last_index.edge_position) <<" " << int(cur_first_index.edge_position) - int(debruijn_k - pacbio_k ) << " and seq "<< - seq_start + seq_end);
- if (cfg::get().pb.additional_debug_info) {
+ if (pb_config_.additional_debug_info) {
DEBUG(" escpected gap length: " << -int(g_.length(prev_edge)) + int(prev_last_index.edge_position) - int(cur_first_index.edge_position) + int(debruijn_k - pacbio_k ) - seq_start + seq_end);
- PathStorageCallback<Graph> callback(g_);
+ omnigraph::PathStorageCallback<Graph> callback(g_);
ProcessPaths(g_, 0, 4000,
start_v, end_v,
callback);
@@ -591,8 +582,8 @@ public:
int seq_len = -start_pos + end_pos;
//int new_seq_len =
//TODO::something more reasonable
- int path_min_len = max(int(floor((seq_len - int(debruijn_k)) * cfg::get().pb.path_limit_pressing)), 0);
- int path_max_len = (int) ((double) (seq_len + (int) debruijn_k) * cfg::get().pb.path_limit_stretching);
+ int path_min_len = max(int(floor((seq_len - int(debruijn_k)) * pb_config_.path_limit_pressing)), 0);
+ int path_max_len = (int) ((double) (seq_len + (int) debruijn_k) * pb_config_.path_limit_stretching);
if (seq_len < 0) {
DEBUG("suspicious negative seq_len " << start_pos << " " << end_pos << " " << path_min_len << " " << path_max_len);
return std::make_pair(-1, -1);
@@ -621,7 +612,7 @@ public:
vector<size_t> result;
DEBUG("seq dist:" << s.size()/3);
if (distance_cashed.find(vertex_pair) == distance_cashed.end()) {
- DistancesLengthsCallback<Graph> callback(g_);
+ omnigraph::DistancesLengthsCallback<Graph> callback(g_);
ProcessPaths(g_, 0, s.size() / 3, start_v,
end_v, callback);
result = callback.distances();
@@ -678,7 +669,7 @@ public:
int start_pos, int end_pos, string &s_add,
string &e_add) {
DEBUG(" Traversing tangled region. Start and end vertices resp: " << g_.int_id(start_v) <<" " << g_.int_id(end_v));
- PathStorageCallback<Graph> callback(g_);
+ omnigraph::PathStorageCallback<Graph> callback(g_);
ProcessPaths(g_,
path_min_length, path_max_length,
start_v, end_v,
@@ -721,7 +712,7 @@ public:
}
// Short read alignment
- MappingPath<EdgeId> GetShortReadAlignment(const Sequence &s) const {
+ omnigraph::MappingPath<EdgeId> GetShortReadAlignment(const Sequence &s) const {
ClustersSet mapping_descr = GetOrderClusters(s);
map<EdgeId, KmerCluster<Graph> > largest_clusters;
@@ -745,25 +736,24 @@ public:
edge_cluster->second = *iter;
}
- }
- else {
+ } else {
largest_clusters.insert(make_pair(iter->edgeId, *iter));
}
}
- MappingPath<EdgeId> result;
+ omnigraph::MappingPath<EdgeId> result;
for (auto iter = largest_clusters.begin(); iter != largest_clusters.end(); ++iter) {
auto first_cluster = iter->second.sorted_positions[iter->second.first_trustable_index];
auto last_cluster = iter->second.sorted_positions[iter->second.last_trustable_index];
- MappingRange range(Range(first_cluster.read_position, last_cluster.read_position),
- Range(first_cluster.edge_position, last_cluster.edge_position));
- result.join(MappingPath<EdgeId>(vector<EdgeId>(1, iter->second.edgeId), vector<MappingRange>(1, range)));
+ omnigraph::MappingRange range(Range(first_cluster.read_position, last_cluster.read_position),
+ Range(first_cluster.edge_position, last_cluster.edge_position));
+ result.join({iter->second.edgeId, range});
}
return result;
}
- pair<EdgeId, size_t> GetUniqueKmerPos(const runtime_k::RtSeq& kmer) const {
+ std::pair<EdgeId, size_t> GetUniqueKmerPos(const runtime_k::RtSeq& kmer) const {
KeyWithHash kwh = tmp_index.ConstructKWH(kmer);
if (tmp_index.valid(kwh.key())) {
@@ -772,7 +762,7 @@ public:
return make_pair(keys[0].edge_id, keys[0].offset);
}
}
- return make_pair(EdgeId(0), -1u);
+ return std::make_pair(EdgeId(0), -1u);
}
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp
index b742c3d..2d3a0f0 100644
--- a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp
+++ b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp
@@ -13,6 +13,7 @@
#include "ConsensusCore/Poa/PoaConsensus.hpp"
#include <algorithm>
+#include <fstream>
namespace pacbio {
template<class Graph>
@@ -37,9 +38,10 @@ private:
public:
size_t min_gap_quantity;
- GapStorage(Graph &g, size_t min_gap_quantity)
+ size_t long_seq_limit_;
+ GapStorage(Graph &g, size_t min_gap_quantity, size_t long_seq_limit)
: g_(g),
- inner_index(), min_gap_quantity(min_gap_quantity){
+ inner_index(), min_gap_quantity(min_gap_quantity), long_seq_limit_(long_seq_limit){
}
size_t FillIndex() {
@@ -73,7 +75,7 @@ public:
HiddenAddGap(p);
if (add_rc) {
TRACE("Adding conjugate");
- HiddenAddGap(p.conjugate(g_, (int) cfg::get().K));
+ HiddenAddGap(p.conjugate(g_, (int) g_.k() ));
}
}
@@ -174,14 +176,13 @@ public:
int end_max = 0;
size_t long_seqs = 0;
size_t short_seqs = 0;
- size_t long_seq_limit = cfg::get().pb.long_seq_limit; //400
bool exclude_long_seqs = false;
for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
if (g_.length(j_iter->start) - j_iter->edge_gap_start_position > 500 || j_iter->edge_gap_end_position > 500) {
DEBUG("ignoring alingment to the middle of edge");
continue;
}
- if (j_iter->gap_seq.size() > long_seq_limit)
+ if (j_iter->gap_seq.size() > long_seq_limit_)
long_seqs++;
else
short_seqs++;
@@ -199,7 +200,7 @@ public:
if (g_.length(j_iter->start) - j_iter->edge_gap_start_position > 500 || j_iter->edge_gap_end_position > 500)
continue;
- if (exclude_long_seqs && j_iter->gap_seq.size() > long_seq_limit)
+ if (exclude_long_seqs && j_iter->gap_seq.size() > long_seq_limit_)
continue;
string s = g_.EdgeNucls(j_iter->start).Subseq(start_min, j_iter->edge_gap_start_position).str();
@@ -237,6 +238,7 @@ private:
int not_unique_gaps;
int chained_gaps;
bool consensus_gap_closing;
+ size_t max_contigs_gap_length_;
public:
void CloseGapsInGraph(map<EdgeId, EdgeId> &replacement) {
for (auto iter = new_edges_.begin(); iter != new_edges_.end(); ++iter) {
@@ -313,7 +315,7 @@ private:
transform(s.begin(), s.end(), s.begin(), ::toupper);
gap_variants.push_back(s);
}
- if (consensus_gap_closing || (gap_variants.size() > 0 && gap_variants[0].length() < cfg::get().pb.max_contigs_gap_length)) {
+ if (consensus_gap_closing || (gap_variants.size() > 0 && gap_variants[0].length() < max_contigs_gap_length_)) {
map <EdgeId, pair<size_t, string>> tmp;
string tmp_string;
string s = g_.EdgeNucls(cl_start->start).Subseq(0, cl_start->edge_gap_start_position).str();
@@ -352,8 +354,8 @@ private:
}
public:
- PacbioGapCloser(Graph &g, bool consensus_gap )
- : g_(g), consensus_gap_closing(consensus_gap) {
+ PacbioGapCloser(Graph &g, bool consensus_gap, size_t max_contigs_gap_length )
+ : g_(g), consensus_gap_closing(consensus_gap), max_contigs_gap_length_(max_contigs_gap_length) {
closed_gaps = 0;
not_unique_gaps = 0;
chained_gaps = 0;
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp
index 38bd2e2..c2ce186 100644
--- a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp
+++ b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp
@@ -5,19 +5,13 @@
//* See file LICENSE for details.
//***************************************************************************
-/*
- * pac_index.hpp
- *
- * Created on: Jan 21, 2013
- * Author: lab42
- */
#pragma once
#include "data_structures/indices/perfect_hash_map.hpp"
-#include "pipeline/graph_pack.hpp"
#include <algorithm>
-using std::map;
-using std::set;
+#include <map>
+#include <set>
+
namespace pacbio {
template<class T>
struct pair_iterator_less {
diff --git a/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp b/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp
index cab3ebe..1334ced 100644
--- a/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp
+++ b/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp
@@ -255,27 +255,26 @@ class NewExtendedSequenceMapper: public SequenceMapper<Graph> {
const KmerSubs& kmer_mapper_;
size_t k_;
bool optimization_on_;
- // mutable size_t mapped_;
- // mutable size_t unmapped_;
bool FindKmer(const Kmer &kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
RangeMappings& range_mappings) const {
std::pair<EdgeId, size_t> position = index_.get(kmer);
- if (position.second != -1u/*index contains this k-mer*/) {
- if (passed.empty() || passed.back() != position.first ||
- kmer_pos != range_mappings.back().initial_range.end_pos ||
- position.second + 1 < range_mappings.back().mapped_range.end_pos) {
+ if (position.second == -1u)
+ return false;
+
+ if (passed.empty() || passed.back() != position.first ||
+ kmer_pos != range_mappings.back().initial_range.end_pos ||
+ position.second + 1 < range_mappings.back().mapped_range.end_pos) {
passed.push_back(position.first);
- range_mappings.push_back(
- MappingRange(Range(kmer_pos, kmer_pos + 1),
- Range(position.second, position.second + 1)));
- } else {
+
+ range_mappings.push_back(MappingRange(Range(kmer_pos, kmer_pos + 1),
+ Range(position.second, position.second + 1)));
+ } else {
range_mappings.back().initial_range.end_pos = kmer_pos + 1;
range_mappings.back().mapped_range.end_pos = position.second + 1;
- }
- return true;
}
- return false;
+
+ return true;
}
bool TryThread(const Kmer& kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
@@ -309,43 +308,23 @@ class NewExtendedSequenceMapper: public SequenceMapper<Graph> {
return false;
}
- bool Substitute(Kmer& kmer) const {
- Kmer subs = kmer_mapper_.Substitute(kmer);
- if (subs != kmer) {
- kmer = subs;
- return true;
- }
- return false;
- }
-
- bool ProcessKmer(Kmer kmer, size_t kmer_pos, std::vector<EdgeId> &passed_edges,
+ bool ProcessKmer(const Kmer &kmer, size_t kmer_pos, std::vector<EdgeId> &passed_edges,
RangeMappings& range_mapping, bool try_thread) const {
if (try_thread) {
- if (!TryThread(kmer, kmer_pos, passed_edges, range_mapping)) {
- Substitute(kmer);
- FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- return false;
- } else {
+ if (!TryThread(kmer, kmer_pos, passed_edges, range_mapping)) {
+ FindKmer(kmer_mapper_.Substitute(kmer), kmer_pos, passed_edges, range_mapping);
+ return false;
+ }
+
return true;
- }
- } else {
- if (!Substitute(kmer)) {
- return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- } else {
- FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
+ }
+
+ if (kmer_mapper_.CanSubstitute(kmer)) {
+ FindKmer(kmer_mapper_.Substitute(kmer), kmer_pos, passed_edges, range_mapping);
return false;
- }
}
- // if (!Substitute(kmer)) {
- // if (try_thread) {
- // return TryThread(kmer, kmer_pos, passed_edges, range_mapping);
- // } else {
- // return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- // }
- // } else {
- // FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- // return false;
- // }
+
+ return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
}
public:
diff --git a/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp b/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp
index ed7c41f..d5af6f9 100644
--- a/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp
+++ b/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp
@@ -8,6 +8,7 @@
#ifndef SEQUENCE_MAPPER_NOTIFIER_HPP_
#define SEQUENCE_MAPPER_NOTIFIER_HPP_
+#include "dev_support/memory_limit.hpp"
#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
#include "short_read_mapper.hpp"
#include "io/reads/paired_read.hpp"
@@ -87,6 +88,8 @@ public:
++size;
NotifyProcessRead(r, mapper, lib_index, ithread);
}
+# pragma omp atomic
+ counter += size;
}
INFO("Total " << counter << " reads processed");
NotifyStopProcessLibrary(lib_index);
diff --git a/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp b/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp
index 2202400..b17559a 100644
--- a/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp
+++ b/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp
@@ -40,7 +40,7 @@ public:
{
if (indices_.find(small_k_) == indices_.end()) {
indices_.insert(make_pair(small_k_,
- new pacbio::PacBioMappingIndex<Graph>(g, small_k_, graph_k, false)));
+ new pacbio::PacBioMappingIndex<Graph>(g, small_k_, graph_k, false, cfg::get().output_dir, cfg::get().pb)));
}
index_ = indices_[small_k_];
++active_mappers_;
diff --git a/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp b/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp
index 68e3050..f0b72a0 100644
--- a/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp
+++ b/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp
@@ -86,6 +86,7 @@ pred::TypedPredicate<typename Graph::EdgeId> AddAlternativesPresenceCondition(co
return pred::And(AlternativesPresenceCondition<Graph>(g), condition);
}
+
template<class Graph>
class CoverageUpperBound : public EdgeCondition<Graph> {
typedef typename Graph::EdgeId EdgeId;
diff --git a/src/modules/assembly_graph/graph_support/contig_output.hpp b/src/modules/assembly_graph/graph_support/contig_output.hpp
index a67166f..26e9dda 100644
--- a/src/modules/assembly_graph/graph_support/contig_output.hpp
+++ b/src/modules/assembly_graph/graph_support/contig_output.hpp
@@ -203,7 +203,7 @@ struct ExtendedContigIdT {
template <class Graph>
void MakeContigIdMap(const Graph& graph, map<EdgeId, ExtendedContigIdT>& ids, const ConnectedComponentCounter &cc_counter_, string prefix) {
int counter = 0;
- for (auto it = graph.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ for (auto it = graph.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
EdgeId e = *it;
if (ids.count(e) == 0) {
string id;
@@ -256,7 +256,7 @@ public:
void PrintContigsFASTG(sequence_stream &os, const ConnectedComponentCounter & cc_counter) {
map<EdgeId, ExtendedContigIdT> ids;
MakeContigIdMap(graph_, ids, cc_counter, "EDGE");
- for (auto it = graph_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
set<string> next;
VertexId v = graph_.EdgeEnd(*it);
auto edges = graph_.OutgoingEdges(v);
@@ -264,8 +264,16 @@ public:
next.insert(ids[*next_it].full_id_);
}
ReportEdge(os, constructor_.construct(*it).first, ids[*it].full_id_, next);
- //FASTG always needs both sets of edges
- //it.HandleDelete(graph_.conjugate(*it));
+ if (*it != graph_.conjugate(*it))
+ {
+ set<string> next_conj;
+ v = graph_.EdgeEnd(graph_.conjugate(*it));
+ edges = graph_.OutgoingEdges(v);
+ for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) {
+ next_conj.insert(ids[*next_it].full_id_);
+ }
+ ReportEdge(os, constructor_.construct(graph_.conjugate(*it)).first, ids[graph_.conjugate(*it)].full_id_, next_conj);
+ }
}
}
};
@@ -297,11 +305,7 @@ void ReportEdge(io::osequencestream_cov& oss
}
}
-inline void OutputContigs(ConjugateDeBruijnGraph& g,
- const string& contigs_output_filename,
- bool output_unipath,
- size_t ,
- bool /*cut_bad_connections*/) {
+inline void OutputContigs(ConjugateDeBruijnGraph &g, const string &contigs_output_filename, bool output_unipath) {
INFO("Outputting contigs to " << contigs_output_filename << ".fasta");
DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
io::osequencestream_cov oss(contigs_output_filename + ".fasta");
diff --git a/src/modules/assembly_graph/graph_support/genomic_quality.hpp b/src/modules/assembly_graph/graph_support/genomic_quality.hpp
index cdf6e12..ee9e75a 100644
--- a/src/modules/assembly_graph/graph_support/genomic_quality.hpp
+++ b/src/modules/assembly_graph/graph_support/genomic_quality.hpp
@@ -133,8 +133,8 @@ class QualityLoggingRemovalHandler {
}
public:
- QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph>& quality_handler,
- bool handle_all = false) :
+ QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph>& quality_handler,
+ bool handle_all = false) :
g_(g), quality_handler_(quality_handler), black_removed_(0), total_(0), handle_all_(handle_all) {
}
@@ -183,7 +183,7 @@ public:
, const string& output_folder, bool handle_all = false) :
base(g, quality_handler, handle_all),
printing_rh_(g, labeler, colorer, output_folder)
- {}
+ {}
virtual void HandlePositiveQuality(EdgeId e) {
printing_rh_.HandleDelete(e, "_" + ToString(this->quality_handler().quality(e)));
diff --git a/src/modules/assembly_graph/handlers/edges_position_handler.hpp b/src/modules/assembly_graph/handlers/edges_position_handler.hpp
index d3aefdf..aaa9af0 100644
--- a/src/modules/assembly_graph/handlers/edges_position_handler.hpp
+++ b/src/modules/assembly_graph/handlers/edges_position_handler.hpp
@@ -135,8 +135,14 @@ public:
VERIFY(this->IsAttached());
std::stringstream ss;
vector<EdgePosition> positions = GetEdgePositions(edge);
+ size_t counter = 0;
for (auto pos_it = positions.begin(), end = positions.end(); pos_it != end; ++pos_it) {
ss << "(" << pos_it->contigId << ": " << pos_it->mr << ")\\n";
+ counter++;
+ if(counter > 30) {
+ ss << "and many more. Totally " << positions.size() << " positions.";
+ break;
+ }
}
return ss.str();
}
diff --git a/src/modules/assembly_graph/paths/mapping_path.hpp b/src/modules/assembly_graph/paths/mapping_path.hpp
index d6cba65..2cb6076 100644
--- a/src/modules/assembly_graph/paths/mapping_path.hpp
+++ b/src/modules/assembly_graph/paths/mapping_path.hpp
@@ -157,6 +157,11 @@ class MappingPath {
public:
MappingPath() {}
+ MappingPath(const ElementId &edge,
+ const MappingRange &range_mapping)
+ : edges_({ edge }),
+ range_mappings_({ range_mapping }) {}
+
MappingPath(const std::vector<ElementId>& edges,
const std::vector<MappingRange> range_mappings)
: edges_(edges),
diff --git a/src/modules/assembly_graph/stats/picture_dump.hpp b/src/modules/assembly_graph/stats/picture_dump.hpp
index cfaa696..18c6d39 100644
--- a/src/modules/assembly_graph/stats/picture_dump.hpp
+++ b/src/modules/assembly_graph/stats/picture_dump.hpp
@@ -276,6 +276,27 @@ struct detail_info_printer {
<< stats.edges() << ", sum length of edges : " << stats.edge_length());
}
+ if (config.save_graph_pack) {
+ string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
+ ToString(call_cnt++, 2) + "_" + pos_name + "/");
+ path::make_dirs(saves_folder);
+ graphio::ConjugateDataPrinter<conj_graph_pack::graph_t> printer(gp_.g);
+ graphio::PrintGraphPack(saves_folder + "graph_pack", printer, gp_);
+ //TODO: separate
+ graphio::PrintClusteredIndices(saves_folder + "graph_pack", printer, gp_.clustered_indices);
+ }
+
+ if (config.save_all) {
+ string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
+ ToString(call_cnt++, 2) + "_" + pos_name);
+ path::make_dirs(saves_folder);
+ string p = saves_folder + "/saves";
+ INFO("Saving current state to " << p);
+
+ debruijn_graph::graphio::PrintAll(p, gp_);
+ debruijn_graph::config::write_lib_data(p);
+ }
+
if (config.save_full_graph) {
string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
ToString(call_cnt++, 2) + "_" + pos_name + "/");
diff --git a/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp b/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp
index d6a3545..7a293f5 100644
--- a/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp
+++ b/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp
@@ -6,13 +6,6 @@
//* See file LICENSE for details.
//***************************************************************************
-/*
- * debruijn_graph_constructor.hpp
- *
- * Created on: Apr 5, 2011
- * Author: sergey
- */
-
#include "assembly_graph/graph_core/graph.hpp"
#include "assembly_graph/graph_core/construction_helper.hpp"
#include "dev_support/standard_base.hpp"
diff --git a/src/modules/data_structures/debruijn_graph/early_simplification.hpp b/src/modules/data_structures/debruijn_graph/early_simplification.hpp
index ccc89e6..3fc9d55 100644
--- a/src/modules/data_structures/debruijn_graph/early_simplification.hpp
+++ b/src/modules/data_structures/debruijn_graph/early_simplification.hpp
@@ -43,7 +43,7 @@ public:
//TODO make parallel
void CleanLinks() {
- vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * cfg::get().max_threads);
+ vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * omp_get_max_threads());
# pragma omp parallel for schedule(guided)
for(size_t i = 0; i < iters.size(); i++) {
for (Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
@@ -60,83 +60,6 @@ public:
}
};
-
-class EarlyTipClipper {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef Index::KMer Kmer;
- typedef Index::KeyWithHash KeyWithHash;
- Index &index_;
- size_t length_bound_;
-
-//Not optimal with respect to the number of large array queries (the one that contains adjacency masks). Should be ok though in case cash works the way I think it does
- size_t RemoveForward(KeyWithHash kh) {
- std::vector<KeyWithHash> tip;
- do {
- tip.push_back(kh);
- kh = index_.GetUniqueOutgoing(kh);
- } while (tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh));
-
- if (!index_.CheckUniqueIncoming(kh)) {
- for (size_t i = 0; i < tip.size(); i++) {
- index_.IsolateVertex(tip[i]);
- }
- return tip.size();
- }
-
- return 0;
- }
-
- size_t RemoveBackward(KeyWithHash kh) {
- std::vector<KeyWithHash> tip;
- do {
- tip.push_back(kh);
- kh = index_.GetUniqueIncoming(kh);
- } while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh));
-
- if (!index_.CheckUniqueOutgoing(kh)) {
- for (size_t i = 0; i < tip.size(); i++) {
- index_.IsolateVertex(tip[i]);
- }
- return tip.size();
- }
- return 0;
- }
-
- //TODO make parallel
- size_t RoughClipTips() {
- size_t result = 0;
- for (auto it = index_.kmer_begin(); it.good(); ++it) {
- KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
- if (index_.IsDeadEnd(kh) && index_.CheckUniqueIncoming(kh)) {
- result += RemoveBackward(kh);
- } else if(index_.IsDeadStart(kh) && index_.CheckUniqueOutgoing(kh)) {
- result += RemoveForward(kh);
- }
- }
- return result;
- }
-
-
-public:
- EarlyTipClipper(Index &index, size_t length_bound) :
- index_(index), length_bound_(length_bound) {}
-
- /*
- * Method returns the number of removed edges
- */
- size_t ClipTips() {
- INFO("Early tip clipping");
- size_t result = RoughClipTips();
- LinkCleaner(index_).CleanLinks();
- INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper");
- return result;
- }
-protected:
- DECL_LOGGER("Early tip clipping");
-};
-
-
class AlternativeEarlyTipClipper {
private:
typedef DeBruijnExtensionIndex<> Index;
@@ -225,7 +148,7 @@ private:
//TODO make parallel
size_t RoughClipTips() {
- vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * cfg::get().max_threads);
+ vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * omp_get_max_threads());
vector<size_t> result(iters.size());
# pragma omp parallel for schedule(guided)
for(size_t i = 0; i < iters.size(); i++) {
diff --git a/src/modules/data_structures/indices/edge_index_builders.hpp b/src/modules/data_structures/indices/edge_index_builders.hpp
index 6e20297..5281bbc 100644
--- a/src/modules/data_structures/indices/edge_index_builders.hpp
+++ b/src/modules/data_structures/indices/edge_index_builders.hpp
@@ -8,25 +8,24 @@
#pragma once
#include "edge_position_index.hpp"
+#include "perfect_hash_map_builder.hpp"
namespace debruijn_graph {
-template <class Builder>
-class GraphPositionFillingIndexBuilder : public Builder {
- typedef Builder base;
+template<class Index>
+class GraphPositionFillingIndexBuilder {
public:
- typedef typename Builder::IndexT IndexT;
- typedef typename IndexT::KMer Kmer;
-// typedef typename IndexT::GraphT GraphT;
+ typedef Index IndexT;
+ typedef typename Index::KMer Kmer;
template<class Graph>
- void BuildIndexFromGraph(IndexT &index,
+ void BuildIndexFromGraph(Index &index,
const Graph/*T*/ &g, size_t read_buffer_size = 0) const {
- base::BuildIndexFromGraph(index, g, read_buffer_size);
+ debruijn_graph::BuildIndexFromGraph(index, g, read_buffer_size);
// Now use the index to fill the coverage and EdgeId's
INFO("Collecting k-mer coverage information from graph, this takes a while.");
- EdgeInfoUpdater<IndexT, Graph> updater(g, index);
+ EdgeInfoUpdater<Index, Graph> updater(g, index);
updater.UpdateAll();
}
@@ -146,7 +145,7 @@ class CoverageFillingEdgeIndexBuilder : public Builder {
size_t BuildIndexFromStream(IndexT &index,
Streams &streams,
io::SingleStream* contigs_stream = 0) const {
- base::BuildIndexFromStream(index, streams, contigs_stream);
+ debruijn_graph::BuildIndexFromStream(index, streams, contigs_stream);
return ParallelFillCoverage(index, streams, false);
}
@@ -164,15 +163,11 @@ class CoverageFillingEdgeIndexBuilder : public Builder {
template<class Index>
struct EdgeIndexHelper {
- typedef Index IndexT;
- typedef typename IndexT::KMer Kmer;
- typedef typename IndexT::KMerIdx KMerIdx;
- typedef typename IndexT::traits_t traits_t;
-// typedef typename IndexT::IdType IdType;
- typedef DeBruijnStreamKMerIndexBuilder<Kmer, IndexT> DeBruijnStreamKMerIndexBuilderT;
- typedef CoverageFillingEdgeIndexBuilder<DeBruijnStreamKMerIndexBuilderT> CoverageFillingEdgeIndexBuilderT;
- typedef DeBruijnGraphKMerIndexBuilder<IndexT> DeBruijnGraphKMerIndexBuilderT;
- typedef GraphPositionFillingIndexBuilder<DeBruijnGraphKMerIndexBuilderT> GraphPositionFillingIndexBuilderT;
+ typedef typename Index::KMer Kmer;
+ typedef typename Index::KMerIdx KMerIdx;
+ typedef typename Index::traits_t traits_t;
+ typedef CoverageFillingEdgeIndexBuilder<Index> CoverageFillingEdgeIndexBuilderT;
+ typedef GraphPositionFillingIndexBuilder<Index> GraphPositionFillingIndexBuilderT;
typedef CoverageFillingEdgeIndexBuilder<GraphPositionFillingIndexBuilderT> CoverageAndGraphPositionFillingIndexBuilderT;
};
diff --git a/src/modules/data_structures/indices/edge_info_updater.hpp b/src/modules/data_structures/indices/edge_info_updater.hpp
index 6c53b79..ce957f6 100644
--- a/src/modules/data_structures/indices/edge_info_updater.hpp
+++ b/src/modules/data_structures/indices/edge_info_updater.hpp
@@ -9,6 +9,7 @@
#include "dev_support/standard_base.hpp"
#include "dev_support/openmp_wrapper.h"
+#include "modules/data_structures/sequence/sequence.hpp"
#include "modules/assembly_graph/graph_core/graph_iterators.hpp"
namespace debruijn_graph {
diff --git a/src/modules/data_structures/indices/edge_multi_index.hpp b/src/modules/data_structures/indices/edge_multi_index.hpp
index e1e7e52..c514e55 100644
--- a/src/modules/data_structures/indices/edge_multi_index.hpp
+++ b/src/modules/data_structures/indices/edge_multi_index.hpp
@@ -6,15 +6,9 @@
//***************************************************************************
#pragma once
-/*
- * edge_multi_index.hpp
- *
- * Created on: May 24, 2013
- * Author: anton
- */
+
#include "perfect_hash_map.hpp"
#include "edge_info_updater.hpp"
-#include "kmer_splitters.hpp"
#include "edge_position_index.hpp"
#include <folly/SmallLocks.h>
diff --git a/src/modules/data_structures/indices/edge_position_index.hpp b/src/modules/data_structures/indices/edge_position_index.hpp
index 6652e48..76f3502 100644
--- a/src/modules/data_structures/indices/edge_position_index.hpp
+++ b/src/modules/data_structures/indices/edge_position_index.hpp
@@ -6,16 +6,11 @@
//***************************************************************************
#pragma once
-/*
- * edge_index.hpp
- *
- * Created on: May 24, 2013
- * Author: anton
- */
#include "perfect_hash_map.hpp"
#include "edge_info_updater.hpp"
-#include "kmer_splitters.hpp"
+#include "data_structures/sequence/runtime_k.hpp"
+#include "modules/io/reads/single_read.hpp"
namespace debruijn_graph {
@@ -82,25 +77,23 @@ public:
*/
bool contains(const KeyWithHash &kwh) const {
// Sanity check
- if (!valid(kwh)) {
+ if (!valid(kwh))
return false;
- }
Value entry = base::get_value(kwh);
-
- if (entry.offset == -1u) {
+ if (entry.offset == -1u)
return false;
- }
- return kwh.key() == KMer(this->k(), graph_.EdgeNucls(entry.edge_id), entry.offset);
+ return graph_.EdgeNucls(entry.edge_id).contains(kwh.key(), entry.offset);
}
void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
- if (valid(kwh)) {
- auto &entry = this->get_raw_value_reference(kwh);
- if (!entry.valid() || contains(kwh)) {
- this->put_value(kwh, Value(id, (unsigned)offset, entry.count));
- }
+ if (!valid(kwh))
+ return;
+
+ auto &entry = this->get_raw_value_reference(kwh);
+ if (!entry.valid() || contains(kwh)) {
+ this->put_value(kwh, Value(id, (unsigned)offset, entry.count));
}
}
diff --git a/src/modules/data_structures/indices/key_with_hash.hpp b/src/modules/data_structures/indices/key_with_hash.hpp
index 06cf673..81026ae 100644
--- a/src/modules/data_structures/indices/key_with_hash.hpp
+++ b/src/modules/data_structures/indices/key_with_hash.hpp
@@ -6,12 +6,7 @@
//***************************************************************************
#pragma once
-/*
- * key_with_hash.hpp
- *
- * Created on: Nov 7, 2013
- * Author: anton
- */
+
#include "storing_traits.hpp"
namespace debruijn_graph {
@@ -112,8 +107,8 @@ private:
const HashFunction &hash_;
Key key_;
- mutable bool is_minimal_;
mutable IdxType idx_; //lazy computation
+ mutable bool is_minimal_;
mutable bool ready_;
void CountIdx() const {
@@ -126,29 +121,24 @@ private:
}
}
- void SetKey(const Key &key) {
- ready_ = false;
- key_ = key;
- }
-
InvertableKeyWithHash(Key key, const HashFunction &hash, bool is_minimal,
- size_t idx, bool ready) :
- hash_(hash), key_(key), is_minimal_(is_minimal), idx_(idx), ready_(
- ready) {
+ size_t idx, bool ready)
+ : hash_(hash), key_(key), idx_(idx),
+ is_minimal_(is_minimal), ready_(ready) {
}
-public:
+ public:
- InvertableKeyWithHash(Key key, const HashFunction &hash) : hash_(hash), key_(key), is_minimal_(false), idx_(0), ready_(false) {
- }
+ InvertableKeyWithHash(Key key, const HashFunction &hash)
+ : hash_(hash), key_(key), idx_(0), is_minimal_(false), ready_(false) {}
- Key key() const {
+ const Key &key() const {
return key_;
}
IdxType idx() const {
- if(!ready_) {
+ if (!ready_)
CountIdx();
- }
+
return idx_;
}
@@ -197,11 +187,13 @@ public:
}
void operator<<=(char nucl) {
- SetKey(key_ << nucl);
+ key_ <<= nucl;
+ ready_ = false;
}
void operator>>=(char nucl) {
- SetKey(key_ >> nucl);
+ key_ >>= nucl;
+ ready_ = false;
}
char operator[](size_t i) const {
diff --git a/src/modules/data_structures/indices/kmer_extension_index.hpp b/src/modules/data_structures/indices/kmer_extension_index.hpp
index 6d1d5df..9e7cc55 100644
--- a/src/modules/data_structures/indices/kmer_extension_index.hpp
+++ b/src/modules/data_structures/indices/kmer_extension_index.hpp
@@ -6,14 +6,8 @@
//***************************************************************************
#pragma once
-/*
- * kmer_extension_index.hpp
- *
- * Created on: May 24, 2013
- * Author: anton
- */
+
#include "perfect_hash_map.hpp"
-#include "kmer_splitters.hpp"
#include "dev_support/simple_tools.hpp"
#include "storing_traits.hpp"
#include <bitset>
@@ -32,7 +26,7 @@ inline uint8_t invert_byte_slow(uint8_t a) {
inline vector<uint8_t> count_invert_byte() {
vector<uint8_t> result;
- for(size_t a = 0; a < 256; a++) {
+ for (size_t a = 0; a < 256; a++) {
result.push_back(invert_byte_slow((uint8_t)a));
}
return result;
@@ -312,102 +306,4 @@ private:
DECL_LOGGER("ExtentionIndex");
};
-template<class Builder>
-class DeBruijnExtensionIndexBuilder : public Builder {
- typedef Builder base;
-public:
- typedef typename Builder::IndexT IndexT;
-
- template<class ReadStream>
- size_t FillExtensionsFromStream(ReadStream &stream, IndexT &index) const {
- unsigned k = index.k();
- size_t rl = 0;
-
- while (!stream.eof()) {
- typename ReadStream::read_type r;
- stream >> r;
- rl = std::max(rl, r.size());
-
- const Sequence &seq = r.sequence();
- if (seq.size() < k + 1)
- continue;
-
- typename IndexT::KeyWithHash kwh = index.ConstructKWH(seq.start<runtime_k::RtSeq>(k));
- for (size_t j = k; j < seq.size(); ++j) {
- char nnucl = seq[j], pnucl = kwh[0];
- index.AddOutgoing(kwh, nnucl);
- kwh <<= nnucl;
- index.AddIncoming(kwh, pnucl);
- }
- }
-
- return rl;
- }
-
- void FillExtensionsFromIndex(const std::string &KPlusOneMersFilename,
- IndexT &index) const {
- unsigned KPlusOne = index.k() + 1;
-
- typename IndexT::kmer_iterator it(
- KPlusOneMersFilename, runtime_k::RtSeq::GetDataSize(KPlusOne));
- for (; it.good(); ++it) {
- runtime_k::RtSeq kpomer(KPlusOne, *it);
-
- char pnucl = kpomer[0], nnucl = kpomer[KPlusOne - 1];
- TRACE("processing k+1-mer " << kpomer);
- index.AddOutgoing(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer)),
- nnucl);
- // FIXME: This is extremely ugly. Needs to add start / end methods to extract first / last N symbols...
- index.AddIncoming(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer << 0)),
- pnucl);
- }
- }
-
-public:
- template<class Streams>
- ReadStatistics BuildExtensionIndexFromStream(
- IndexT &index, Streams &streams, io::SingleStream* contigs_stream = 0,
- size_t read_buffer_size = 0) const {
- unsigned nthreads = (unsigned) streams.size();
-
- // First, build a k+1-mer index
- DeBruijnReadKMerSplitter<typename Streams::ReadT, StoringTypeFilter<typename IndexT::storing_type>> splitter(
- index.workdir(), index.k() + 1, 0xDEADBEEF, streams,
- contigs_stream, read_buffer_size);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
- counter.CountAll(nthreads, nthreads, /* merge */false);
-
- // Now, count unique k-mers from k+1-mers
- DeBruijnKMerKMerSplitter<StoringTypeFilter<typename IndexT::storing_type> > splitter2(index.workdir(), index.k(),
- index.k() + 1, IndexT::storing_type::IsInvertable(), read_buffer_size);
- for (unsigned i = 0; i < nthreads; ++i)
- splitter2.AddKMers(counter.GetMergedKMersFname(i));
- KMerDiskCounter<runtime_k::RtSeq> counter2(index.workdir(), splitter2);
-
- index.BuildIndex(counter2, 16, nthreads);
-
- // Build the kmer extensions
- INFO("Building k-mer extensions from k+1-mers");
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned i = 0; i < nthreads; ++i)
- FillExtensionsFromIndex(counter.GetMergedKMersFname(i), index);
- INFO("Building k-mer extensions from k+1-mers finished.");
-
- return splitter.stats();
- }
-
-private:
- DECL_LOGGER("DeBruijnExtensionIndexBuilder");
-};
-
-template<class Index>
-struct ExtensionIndexHelper {
- typedef Index IndexT;
- typedef typename IndexT::traits_t traits_t;
- typedef typename IndexT::KMer Kmer;
- typedef typename IndexT::KMerIdx KMerIdx;
- typedef DeBruijnStreamKMerIndexBuilder<Kmer, IndexT> DeBruijnStreamKMerIndexBuilderT;
- typedef DeBruijnExtensionIndexBuilder<DeBruijnStreamKMerIndexBuilderT> DeBruijnExtensionIndexBuilderT;
-};
-
}
diff --git a/src/modules/data_structures/indices/kmer_extension_index_builder.hpp b/src/modules/data_structures/indices/kmer_extension_index_builder.hpp
new file mode 100644
index 0000000..6f4f9fc
--- /dev/null
+++ b/src/modules/data_structures/indices/kmer_extension_index_builder.hpp
@@ -0,0 +1,106 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "kmer_extension_index.hpp"
+#include "kmer_splitters.hpp"
+
+class DeBruijnExtensionIndexBuilder {
+public:
+ template<class ReadStream, class Index>
+ size_t FillExtensionsFromStream(ReadStream &stream, Index &index) const {
+ unsigned k = index.k();
+ size_t rl = 0;
+
+ while (!stream.eof()) {
+ typename ReadStream::read_type r;
+ stream >> r;
+ rl = std::max(rl, r.size());
+
+ const Sequence &seq = r.sequence();
+ if (seq.size() < k + 1)
+ continue;
+
+ typename Index::KeyWithHash kwh = index.ConstructKWH(seq.start<runtime_k::RtSeq>(k));
+ for (size_t j = k; j < seq.size(); ++j) {
+ char nnucl = seq[j], pnucl = kwh[0];
+ index.AddOutgoing(kwh, nnucl);
+ kwh <<= nnucl;
+ index.AddIncoming(kwh, pnucl);
+ }
+ }
+
+ return rl;
+ }
+
+ template<class Index>
+ void FillExtensionsFromIndex(const std::string &KPlusOneMersFilename,
+ Index &index) const {
+ unsigned KPlusOne = index.k() + 1;
+
+ typename Index::kmer_iterator it(KPlusOneMersFilename,
+ runtime_k::RtSeq::GetDataSize(KPlusOne));
+ for (; it.good(); ++it) {
+ runtime_k::RtSeq kpomer(KPlusOne, *it);
+
+ char pnucl = kpomer[0], nnucl = kpomer[KPlusOne - 1];
+ TRACE("processing k+1-mer " << kpomer);
+ index.AddOutgoing(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer)),
+ nnucl);
+ // FIXME: This is extremely ugly. Needs to add start / end methods to extract first / last N symbols...
+ index.AddIncoming(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer << 0)),
+ pnucl);
+ }
+ }
+
+public:
+ template<class Index, class Streams>
+ ReadStatistics BuildExtensionIndexFromStream(Index &index, Streams &streams, io::SingleStream* contigs_stream = 0,
+ size_t read_buffer_size = 0) const {
+ unsigned nthreads = (unsigned) streams.size();
+
+ // First, build a k+1-mer index
+ DeBruijnReadKMerSplitter<typename Streams::ReadT,
+ StoringTypeFilter<typename Index::storing_type>>
+ splitter(index.workdir(), index.k() + 1, 0xDEADBEEF, streams,
+ contigs_stream, read_buffer_size);
+ KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
+ counter.CountAll(nthreads, nthreads, /* merge */false);
+
+ // Now, count unique k-mers from k+1-mers
+ DeBruijnKMerKMerSplitter<StoringTypeFilter<typename Index::storing_type> >
+ splitter2(index.workdir(), index.k(),
+ index.k() + 1, Index::storing_type::IsInvertable(), read_buffer_size);
+ for (unsigned i = 0; i < nthreads; ++i)
+ splitter2.AddKMers(counter.GetMergedKMersFname(i));
+ KMerDiskCounter<runtime_k::RtSeq> counter2(index.workdir(), splitter2);
+
+ BuildIndex(index, counter2, 16, nthreads);
+
+ // Build the kmer extensions
+ INFO("Building k-mer extensions from k+1-mers");
+# pragma omp parallel for num_threads(nthreads)
+ for (unsigned i = 0; i < nthreads; ++i)
+ FillExtensionsFromIndex(counter.GetMergedKMersFname(i), index);
+ INFO("Building k-mer extensions from k+1-mers finished.");
+
+ return splitter.stats();
+ }
+
+private:
+ DECL_LOGGER("DeBruijnExtensionIndexBuilder");
+};
+
+template<class Index>
+struct ExtensionIndexHelper {
+ using IndexT = Index;
+ typedef typename Index::traits_t traits_t;
+ typedef typename Index::KMer Kmer;
+ typedef typename Index::KMerIdx KMerIdx;
+ using DeBruijnExtensionIndexBuilderT = DeBruijnExtensionIndexBuilder;
+};
+
diff --git a/src/modules/data_structures/indices/kmer_splitters.hpp b/src/modules/data_structures/indices/kmer_splitters.hpp
index ba56656..9e35934 100644
--- a/src/modules/data_structures/indices/kmer_splitters.hpp
+++ b/src/modules/data_structures/indices/kmer_splitters.hpp
@@ -12,6 +12,7 @@
#include "dev_support/file_limit.hpp"
#include "data_structures/sequence/runtime_k.hpp"
+#include "data_structures/mph_index/kmer_index_builder.hpp"
namespace debruijn_graph {
@@ -35,73 +36,31 @@ struct StoringTypeFilter<InvertableStoring> {
}
};
-// used for temporary reads storage during parallel reading
-static const size_t READS_BUFFER_SIZE = 536870912; // 512 MB in bytes
-
-typedef ::KMerSplitter<runtime_k::RtSeq> RtSeqKMerSplitter;
-
-typedef KMerVector<runtime_k::RtSeq> RtSeqKMerVector;
-typedef std::vector<RtSeqKMerVector> KMerBuffer;
+using RtSeqKMerSplitter = ::KMerSortingSplitter<runtime_k::RtSeq>;
template<class KmerFilter>
class DeBruijnKMerSplitter : public RtSeqKMerSplitter {
private:
- bool skip_not_minimal_;
KmerFilter kmer_filter_;
protected:
size_t read_buffer_size_;
protected:
- size_t FillBufferFromSequence(const Sequence &seq,
- KMerBuffer &buffer, unsigned num_files) const {
- size_t kmers = 0;
-
+ bool FillBufferFromSequence(const Sequence &seq,
+ unsigned thread_id) {
if (seq.size() < this->K_)
- return kmers;
+ return false;
runtime_k::RtSeq kmer = seq.start<runtime_k::RtSeq>(this->K_) >> 'A';
+ bool stop = false;
for (size_t j = this->K_ - 1; j < seq.size(); ++j) {
kmer <<= seq[j];
- if(kmer_filter_.filter(kmer)) {
- buffer[this->GetFileNumForSeq(kmer, num_files)].push_back(kmer);
- kmers++;
- }
- }
- return kmers;
- }
-
-
- void DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const{
- # pragma omp parallel for
- for (unsigned k = 0; k < num_files; ++k) {
- size_t sz = 0;
- for (size_t i = 0; i < nthreads; ++i)
- sz += buffers[i][k].size();
-
- KMerVector<runtime_k::RtSeq> SortBuffer(this->K_, sz);
- for (size_t i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = buffers[i];
- for (size_t j = 0; j < entry[k].size(); ++j)
- SortBuffer.push_back(entry[k][j]);
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerVector<runtime_k::RtSeq>::less2_fast());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end(), KMerVector<runtime_k::RtSeq>::equal_to());
-
- # pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), SortBuffer.el_data_size(), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
+ if (!kmer_filter_.filter(kmer))
+ continue;
- for (unsigned i = 0; i < nthreads; ++i) {
- for (unsigned j = 0; j < num_files; ++j) {
- buffers[i][j].clear();
- }
+ stop |= this->push_back_internal(kmer, thread_id);
}
+
+ return stop;
}
public:
@@ -126,12 +85,10 @@ class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
template<class ReadStream>
ReadStatistics
- FillBufferFromStream(ReadStream& stream,
- KMerBuffer &tmp_entries,
- unsigned num_files, size_t cell_size) const;
+ FillBufferFromStream(ReadStream& stream, unsigned thread_id);
ReadStatistics rs_;
-
+
public:
DeBruijnReadKMerSplitter(const std::string &work_dir,
unsigned K, uint32_t seed,
@@ -139,10 +96,9 @@ class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
io::SingleStream* contigs_stream = 0,
size_t read_buffer_size = 0)
: DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size, seed),
- streams_(streams), contigs_(contigs_stream), rs_({0 ,0 ,0}) {
- }
+ streams_(streams), contigs_(contigs_stream), rs_({0 ,0 ,0}) {}
- virtual path::files_t Split(size_t num_files);
+ path::files_t Split(size_t num_files) override;
size_t read_length() const { return rs_.max_read_length_; }
ReadStatistics stats() const { return rs_; }
@@ -151,18 +107,18 @@ class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
template<class Read, class KmerFilter> template<class ReadStream>
ReadStatistics
DeBruijnReadKMerSplitter<Read, KmerFilter>::FillBufferFromStream(ReadStream &stream,
- KMerBuffer &buffer,
- unsigned num_files, size_t cell_size) const {
+ unsigned thread_id) {
typename ReadStream::ReadT r;
- size_t reads = 0, kmers = 0, rl = 0, bases = 0;
+ size_t reads = 0, rl = 0, bases = 0;
- while (!stream.eof() && kmers < num_files * cell_size) {
+ while (!stream.eof()) {
stream >> r;
rl = std::max(rl, r.size());
reads += 1;
bases += r.size();
- kmers += this->FillBufferFromSequence(r.sequence(), buffer, num_files);
+ if (this->FillBufferFromSequence(r.sequence(), thread_id))
+ break;
}
return { reads, rl, bases };
}
@@ -172,46 +128,14 @@ path::files_t DeBruijnReadKMerSplitter<Read, KmerFilter>::Split(size_t num_files
unsigned nthreads = (unsigned) streams_.size();
INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(this->GetRawKMersFname(i));
-
- size_t file_limit = num_files + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
- if (reads_buffer_size == 0) {
- reads_buffer_size = READS_BUFFER_SIZE;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size /
- (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
-
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
- INFO("Using cell size of " << cell_size);
-
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
- }
+ path::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_);
size_t counter = 0, rl = 0, bases = 0, n = 15;
streams_.reset();
while (!streams_.eof()) {
# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) reduction(+ : bases) shared(rl)
- for (size_t i = 0; i < nthreads; ++i) {
- ReadStatistics stats = FillBufferFromStream(streams_[i], tmp_entries[i], (unsigned) num_files, cell_size);
+ for (unsigned i = 0; i < nthreads; ++i) {
+ ReadStatistics stats = FillBufferFromStream(streams_[i], i);
counter += stats.reads_;
bases += stats.bases_;
@@ -224,7 +148,7 @@ path::files_t DeBruijnReadKMerSplitter<Read, KmerFilter>::Split(size_t num_files
}
}
- this->DumpBuffers(num_files, nthreads, tmp_entries, out);
+ this->DumpBuffers(out);
if (counter >> n) {
INFO("Processed " << counter << " reads");
@@ -234,11 +158,11 @@ path::files_t DeBruijnReadKMerSplitter<Read, KmerFilter>::Split(size_t num_files
if (contigs_) {
INFO("Adding contigs from previous K");
- size_t cnt = 0;
+ unsigned cnt = 0;
contigs_->reset();
while (!contigs_->eof()) {
- FillBufferFromStream(*contigs_, tmp_entries[cnt], (unsigned) num_files, cell_size);
- this->DumpBuffers(num_files, nthreads, tmp_entries, out);
+ FillBufferFromStream(*contigs_, cnt);
+ this->DumpBuffers(out);
if (++cnt >= nthreads)
cnt = 0;
}
@@ -258,29 +182,27 @@ class DeBruijnGraphKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
const Graph &g_;
- size_t FillBufferFromEdges(EdgeIt &edge,
- KMerBuffer &tmp_entries,
- unsigned num_files, size_t cell_size) const;
+ size_t FillBufferFromEdges(EdgeIt &edge, unsigned thread_id);
public:
DeBruijnGraphKMerSplitter(const std::string &work_dir,
unsigned K, const Graph &g, size_t read_buffer_size = 0)
: DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size), g_(g) {}
- virtual path::files_t Split(size_t num_files);
+ path::files_t Split(size_t num_files) override;
};
template<class Graph, class KmerFilter>
size_t
DeBruijnGraphKMerSplitter<Graph, KmerFilter>::FillBufferFromEdges(EdgeIt &edge,
- KMerBuffer &buffer,
- unsigned num_files, size_t cell_size) const {
+ unsigned thread_id) {
size_t seqs = 0;
- for (size_t kmers = 0; !edge.IsEnd() && kmers < num_files * cell_size; ++edge) {
+ for (; !edge.IsEnd(); ++edge) {
const Sequence &nucls = g_.EdgeNucls(*edge);
- kmers += this->FillBufferFromSequence(nucls, buffer, num_files);
seqs += 1;
+ if (this->FillBufferFromSequence(nucls, thread_id))
+ break;
}
return seqs;
@@ -290,38 +212,13 @@ template<class Graph, class KmerFilter>
path::files_t DeBruijnGraphKMerSplitter<Graph, KmerFilter>::Split(size_t num_files) {
INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(this->GetRawKMersFname(i));
-
- size_t file_limit = num_files + 2*16;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
- if (reads_buffer_size == 0) {
- reads_buffer_size = READS_BUFFER_SIZE;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size /
- (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
- INFO("Using cell size of " << cell_size);
-
- std::vector<KMerBuffer> tmp_entries(1);
- KMerBuffer &entry = tmp_entries[0];
- entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
+ path::files_t out = this->PrepareBuffers(num_files, 1, this->read_buffer_size_);
size_t counter = 0, n = 10;
for (auto it = g_.ConstEdgeBegin(); !it.IsEnd(); ) {
- counter += FillBufferFromEdges(it, tmp_entries[0], (unsigned) num_files, cell_size);
+ counter += FillBufferFromEdges(it, 0);
- this->DumpBuffers(num_files, 1, tmp_entries, out);
+ this->DumpBuffers(out);
if (counter >> n) {
INFO("Processed " << counter << " edges");
@@ -344,73 +241,47 @@ class DeBruijnKMerKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
bool add_rc_;
size_t FillBufferFromKMers(kmer_iterator &kmer,
- KMerBuffer &tmp_entries,
- unsigned num_files, size_t cell_size) const;
+ unsigned thread_id);
public:
DeBruijnKMerKMerSplitter(const std::string &work_dir,
unsigned K_target, unsigned K_source, bool add_rc, size_t read_buffer_size = 0)
- : DeBruijnKMerSplitter<KmerFilter>(work_dir, K_target, KmerFilter(), read_buffer_size), K_source_(K_source), add_rc_(add_rc) {}
+ : DeBruijnKMerSplitter<KmerFilter>(work_dir, K_target, KmerFilter(), read_buffer_size),
+ K_source_(K_source), add_rc_(add_rc) {}
void AddKMers(const std::string &file) {
kmers_.push_back(file);
}
- virtual path::files_t Split(size_t num_files);
+ path::files_t Split(size_t num_files) override;
};
template<class KmerFilter>
inline size_t DeBruijnKMerKMerSplitter<KmerFilter>::FillBufferFromKMers(kmer_iterator &kmer,
- KMerBuffer &buffer,
- unsigned num_files, size_t cell_size) const {
+ unsigned thread_id) {
size_t seqs = 0;
- for (size_t kmers = 0; kmer.good() && kmers < num_files * cell_size; ++kmer) {
+ for (; kmer.good(); ++kmer) {
Sequence nucls(runtime_k::RtSeq(K_source_, *kmer));
- kmers += this->FillBufferFromSequence(nucls, buffer, num_files);
- if(add_rc_)
- kmers += this->FillBufferFromSequence(!nucls, buffer, num_files);
seqs += 1;
+
+ bool stop = this->FillBufferFromSequence(nucls, thread_id);
+ if (add_rc_)
+ stop |= this->FillBufferFromSequence(!nucls, thread_id);
+
+ if (stop)
+ break;
}
return seqs;
}
template<class KmerFilter>
-inline path::files_t DeBruijnKMerKMerSplitter<KmerFilter>::Split(size_t num_files) {
+path::files_t DeBruijnKMerKMerSplitter<KmerFilter>::Split(size_t num_files) {
unsigned nthreads = (unsigned) kmers_.size();
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(this->GetRawKMersFname(i));
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
- size_t file_limit = num_files + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
- if (reads_buffer_size == 0) {
- reads_buffer_size = READS_BUFFER_SIZE;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size /
- (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
- INFO("Using cell size of " << cell_size);
-
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
- }
+ path::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_);
size_t counter = 0, n = 10;
std::vector<kmer_iterator> its;
@@ -418,23 +289,19 @@ inline path::files_t DeBruijnKMerKMerSplitter<KmerFilter>::Split(size_t num_file
for (auto it = kmers_.begin(), et = kmers_.end(); it != et; ++it)
its.emplace_back(*it, runtime_k::RtSeq::GetDataSize(K_source_));
- bool anygood = false;
- do {
+ while (std::any_of(its.begin(), its.end(),
+ [](const kmer_iterator &it) { return it.good(); })) {
# pragma omp parallel for num_threads(nthreads) reduction(+ : counter)
- for (size_t i = 0; i < nthreads; ++i)
- counter += FillBufferFromKMers(its[i], tmp_entries[i], (unsigned) num_files, cell_size);
+ for (unsigned i = 0; i < nthreads; ++i)
+ counter += FillBufferFromKMers(its[i], i);
- this->DumpBuffers(num_files, nthreads, tmp_entries, out);
+ this->DumpBuffers(out);
if (counter >> n) {
INFO("Processed " << counter << " kmers");
n += 1;
}
-
- anygood = false;
- for (auto it = its.begin(), et = its.end(); it != et; ++it)
- anygood |= it->good();
- } while (anygood);
+ }
INFO("Used " << counter << " kmers.");
diff --git a/src/modules/data_structures/indices/perfect_hash_map.hpp b/src/modules/data_structures/indices/perfect_hash_map.hpp
index 6a58abc..941acba 100644
--- a/src/modules/data_structures/indices/perfect_hash_map.hpp
+++ b/src/modules/data_structures/indices/perfect_hash_map.hpp
@@ -7,24 +7,18 @@
//***************************************************************************
#include "dev_support/openmp_wrapper.h"
-
-#include "io/reads_io/io_helper.hpp"
+#include "dev_support/path_helper.hpp"
+#include "io/kmers_io/kmer_iterator.hpp"
#include "data_structures/mph_index/kmer_index.hpp"
-#include "utils/adt/kmer_vector.hpp"
-
-#include "libcxx/sort.hpp"
-#include "kmer_splitters.hpp"
#include "key_with_hash.hpp"
#include "values.hpp"
#include "storing_traits.hpp"
#include <vector>
#include <cstdlib>
-#include <cstdio>
#include <cstdint>
-#include <io/kmers_io/kmer_iterator.hpp>
namespace debruijn_graph {
@@ -36,7 +30,7 @@ public:
typedef K KeyType;
typedef traits traits_t;
protected:
- typedef KMerIndex<traits> KMerIndexT;
+ typedef KMerIndex<traits> KMerIndexT;
//these fields are protected only for reduction of storage in edge indices BinWrite
KMerIndexT index_;
private:
@@ -84,7 +78,7 @@ public:
}
};
-template<class K, class V, class traits, class StoringType>
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
class PerfectHashMap : public ValueArray<V>, public IndexWrapper<K, traits> {
public:
typedef size_t IdxType;
@@ -142,20 +136,12 @@ public:
KeyBase::BinRead(reader, tmp);
ValueBase::BinRead(reader, tmp);
}
-//todo think more about hierarchy
-protected:
- template <class KmerCounter>
- void BuildIndex(KmerCounter& counter, size_t bucket_num, size_t thread_num, bool save_final = true) {
- KMerIndexBuilder<KMerIndexT> builder(this->workdir(),
- (unsigned) bucket_num,
- (unsigned) thread_num);
- size_t sz = builder.BuildIndex(index_, counter, save_final);
- ValueBase::resize(sz);
- }
+
+ friend struct PerfectHashMapBuilder;
};
-template<class K, class V, class traits, class StoringType>
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
class KeyStoringMap : public PerfectHashMap<K, V, traits, StoringType> {
private:
typedef PerfectHashMap<K, V, traits, StoringType> base;
@@ -170,7 +156,7 @@ public:
using base::ConstructKWH;
private:
- typename traits::FinalKMerStorage *kmers_;
+ std::unique_ptr<typename traits::FinalKMerStorage> kmers_;
void SortUniqueKMers() const {
size_t swaps = 0;
@@ -215,13 +201,9 @@ protected:
public:
KeyStoringMap(size_t k, const std::string &workdir)
- : base(k, workdir),
- kmers_(NULL) {
- }
+ : base(k, workdir), kmers_(nullptr) {}
- ~KeyStoringMap() {
- delete kmers_;
- }
+ ~KeyStoringMap() {}
KMer true_kmer(KeyWithHash kwh) const {
VERIFY(this->valid(kwh));
@@ -232,8 +214,7 @@ public:
void clear() {
base::clear();
- delete kmers_;
- kmers_ = NULL;
+ kmers_ = nullptr;
}
kmer_iterator kmer_begin() {
@@ -255,7 +236,7 @@ public:
return false;
auto it = this->kmers_->begin() + kwh.idx();
- if(!kwh.is_minimal())
+ if (!kwh.is_minimal())
return (typename traits_t::raw_equal_to()(!kwh.key(), *it));
else
return (typename traits_t::raw_equal_to()(kwh.key(), *it));
@@ -298,18 +279,10 @@ public:
return res;
}
- template<class KmerCounter>
- void BuildIndex(KmerCounter& counter, size_t bucket_num,
- size_t thread_num) {
- base::BuildIndex(counter, bucket_num, thread_num);
- VERIFY(!kmers_);
- kmers_ = counter.GetFinalKMers();
- VERIFY(kmers_);
- SortUniqueKMers();
- }
+ friend struct KeyStoringIndexBuilder;
};
-template<class K, class V, class traits, class StoringType>
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
class KeyIteratingMap : public PerfectHashMap<K, V, traits, StoringType> {
typedef PerfectHashMap<K, V, traits, StoringType> base;
@@ -325,12 +298,9 @@ public:
public:
KeyIteratingMap(size_t k, const std::string &workdir)
- : base(k, workdir),
- KMersFilename_("") {
- }
+ : base(k, workdir), KMersFilename_("") {}
- ~KeyIteratingMap() {
- }
+ ~KeyIteratingMap() {}
typedef MMappedFileRecordArrayIterator<typename KMer::DataType> kmer_iterator;
@@ -342,55 +312,7 @@ public:
return io::make_kmer_iterator<KMer>(this->KMersFilename_, base::k(), parts);
}
-
- template<class KmerCounter>
- void BuildIndex(KmerCounter& counter, size_t bucket_num,
- size_t thread_num) {
- base::BuildIndex(counter, bucket_num, thread_num);
- KMersFilename_ = counter.GetFinalKMersFname();
- }
-};
-
-//Seq is here for partial specialization
-template <class Seq, class Index>
-class DeBruijnStreamKMerIndexBuilder {
-
-};
-
-template<class Index>
-class DeBruijnStreamKMerIndexBuilder<runtime_k::RtSeq, Index> {
- public:
- typedef Index IndexT;
-
- template <class Streams>
- size_t BuildIndexFromStream(IndexT &index,
- Streams &streams,
- io::SingleStream* contigs_stream = 0) const {
- DeBruijnReadKMerSplitter<typename Streams::ReadT, StoringTypeFilter<typename IndexT::storing_type>>
- splitter(index.workdir(), index.k(), 0, streams, contigs_stream);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
-
- index.BuildIndex(counter, 16, streams.size());
- return 0;
- }
-};
-
-//fixme makes hierarchy a bit strange
-template <class Index, class Seq = typename Index::KMer>
-class DeBruijnGraphKMerIndexBuilder;
-
-template <class Index>
-class DeBruijnGraphKMerIndexBuilder<Index, runtime_k::RtSeq> {
- public:
- typedef Index IndexT;
-
- template<class Graph>
- void BuildIndexFromGraph(IndexT &index, const Graph &g, size_t read_buffer_size = 0) const {
- DeBruijnGraphKMerSplitter<Graph, StoringTypeFilter<typename Index::storing_type>> splitter(index.workdir(), index.k(),
- g, read_buffer_size);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
- index.BuildIndex(counter, 16, 1);
- }
+ friend struct KeyIteratingIndexBuilder;
};
}
diff --git a/src/modules/data_structures/indices/perfect_hash_map_builder.hpp b/src/modules/data_structures/indices/perfect_hash_map_builder.hpp
new file mode 100644
index 0000000..b94a596
--- /dev/null
+++ b/src/modules/data_structures/indices/perfect_hash_map_builder.hpp
@@ -0,0 +1,102 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "data_structures/mph_index/kmer_index_builder.hpp"
+
+#include "perfect_hash_map.hpp"
+#include "kmer_splitters.hpp"
+
+namespace debruijn_graph {
+
+struct PerfectHashMapBuilder {
+ template<class K, class V, class traits, class StoringType, class Counter>
+ void BuildIndex(PerfectHashMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) const {
+ using KMerIndex = typename PerfectHashMap<K, V, traits, StoringType>::KMerIndexT;
+
+ KMerIndexBuilder<KMerIndex> builder(index.workdir(),
+ (unsigned) bucket_num,
+ (unsigned) thread_num);
+ size_t sz = builder.BuildIndex(index.index_, counter, save_final);
+ index.resize(sz);
+ }
+};
+
+struct KeyStoringIndexBuilder {
+ template<class K, class V, class traits, class StoringType, class Counter>
+ void BuildIndex(KeyStoringMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) const {
+ phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final);
+ VERIFY(!index.kmers_.get());
+ index.kmers_ = counter.GetFinalKMers();
+ VERIFY(index.kmers_.get());
+ index.SortUniqueKMers();
+ }
+
+ private:
+ PerfectHashMapBuilder phm_builder_;
+};
+
+struct KeyIteratingIndexBuilder {
+ template<class K, class V, class traits, class StoringType, class Counter>
+ void BuildIndex(KeyIteratingMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) const {
+ phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final);
+ index.KMersFilename_ = counter.GetFinalKMersFname();
+ }
+
+ private:
+ PerfectHashMapBuilder phm_builder_;
+};
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(KeyIteratingMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) {
+ KeyIteratingIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(KeyStoringMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) {
+ KeyStoringIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(PerfectHashMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) {
+ PerfectHashMapBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+template<class Index, class Streams>
+size_t BuildIndexFromStream(Index &index,
+ Streams &streams,
+ io::SingleStream* contigs_stream = 0) {
+ DeBruijnReadKMerSplitter<typename Streams::ReadT,
+ StoringTypeFilter<typename Index::storing_type>>
+ splitter(index.workdir(), index.k(), 0, streams, contigs_stream);
+ KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
+ BuildIndex(index, counter, 16, streams.size());
+ return 0;
+}
+
+template<class Index, class Graph>
+void BuildIndexFromGraph(Index &index, const Graph &g, size_t read_buffer_size = 0) {
+ DeBruijnGraphKMerSplitter<Graph,
+ StoringTypeFilter<typename Index::storing_type>>
+ splitter(index.workdir(), index.k(), g, read_buffer_size);
+ KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
+ BuildIndex(index, counter, 16, 1);
+}
+
+}
diff --git a/src/modules/data_structures/mph_index/kmer_index.hpp b/src/modules/data_structures/mph_index/kmer_index.hpp
index 16d2c66..28b429d 100644
--- a/src/modules/data_structures/mph_index/kmer_index.hpp
+++ b/src/modules/data_structures/mph_index/kmer_index.hpp
@@ -6,119 +6,17 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "io/kmers_io/mmapped_reader.hpp"
-#include "io/kmers_io/mmapped_writer.hpp"
-#include "utils/adt/pointer_iterator.hpp"
-
#include "mphf.hpp"
#include "base_hash.hpp"
-#include "hypergraph.hpp"
-#include "hypergraph_sorter_seq.hpp"
-
-#include "dev_support/openmp_wrapper.h"
-
-#include "dev_support/logger/logger.hpp"
-#include "dev_support/path_helper.hpp"
-
-#include "dev_support/memory_limit.hpp"
-#include <libcxx/sort.hpp>
+#include "kmer_index_traits.hpp"
-#include <algorithm>
-#ifdef USE_GLIBCXX_PARALLEL
-#include <parallel/algorithm>
-#endif
-#include <fstream>
#include <vector>
#include <cmath>
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-# include <jemalloc/jemalloc.h>
-#endif
-
template<class Index>
class KMerIndexBuilder;
-template<class Seq>
-struct kmer_index_traits {
- typedef Seq SeqType;
- typedef MMappedRecordArrayReader<typename Seq::DataType> RawKMerStorage;
- typedef MMappedRecordArrayReader<typename Seq::DataType> FinalKMerStorage;
- typedef typename RawKMerStorage::iterator raw_data_iterator;
- typedef typename RawKMerStorage::const_iterator raw_data_const_iterator;
- typedef typename RawKMerStorage::iterator::value_type KMerRawData;
- typedef typename RawKMerStorage::iterator::reference KMerRawReference;
- typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference;
-
- struct raw_equal_to {
- bool operator()(const Seq &lhs, const KMerRawReference rhs) {
- return (array_equal_to<typename Seq::DataType>()(lhs.data(), lhs.data_size(), rhs));
- }
- };
-
- struct raw_create {
- Seq operator()(unsigned K, const KMerRawReference kmer) {
- return Seq(K, kmer.data());
- }
- Seq operator()(unsigned K, const KMerRawConstReference kmer) {
- return Seq(K, kmer.data());
- }
- };
-
- struct hash_function {
- uint64_t operator()(const Seq &k) const{
- return typename Seq::hash()(k);
- }
- uint64_t operator()(const KMerRawReference k) const {
- return typename Seq::hash()(k.data(), k.size());
- }
- };
-
- struct KMerRawReferenceAdaptor {
- emphf::byte_range_t operator()(const KMerRawReference k) const {
- const uint8_t * data = (const uint8_t*)k.data();
- return std::make_pair(data, data + k.data_size());
- }
- };
-
- struct KMerSeqAdaptor {
- emphf::byte_range_t operator()(const Seq &k) const {
- const uint8_t * data = (const uint8_t*)k.data();
- return std::make_pair(data, data + k.data_size() * sizeof(typename Seq::DataType));
- }
- };
-
- template<class Writer>
- static void raw_serialize(Writer &writer, RawKMerStorage *data) {
- size_t sz = data->data_size(), elcnt = data->elcnt();
- unsigned PageSize = getpagesize();
- writer.write((char*)&sz, sizeof(sz));
- writer.write((char*)&elcnt, sizeof(elcnt));
- // Make sure data is aligned to the page boundary
- size_t cpos = writer.tellp();
- size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize;
- size_t off = pos - writer.tellp();
- writer.write((char*)&off, sizeof(off));
- writer.seekp(pos);
- writer.write((char*)data->data(), data->data_size());
- }
-
- template<class Reader>
- static RawKMerStorage *raw_deserialize(Reader &reader, const std::string &FileName) {
- size_t sz, off, elcnt;
- reader.read((char*)&sz, sizeof(sz));
- reader.read((char*)&elcnt, sizeof(elcnt));
- reader.read((char*)&off, sizeof(off));
- off -= sizeof(off);
- off += reader.tellg();
-
- return new RawKMerStorage(FileName, elcnt, false, off, sz);
- }
-
-};
-
template<class traits>
class KMerIndex {
public:
@@ -229,302 +127,3 @@ class KMerIndex {
friend class KMerIndexBuilder<__self>;
};
-
-template<class Seq>
-class KMerSplitter {
- public:
- typedef typename Seq::hash hash_function;
-
- KMerSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
- : work_dir_(work_dir), K_(K), seed_(seed) {}
-
- virtual ~KMerSplitter() {}
-
- virtual path::files_t Split(size_t num_files) = 0;
-
- unsigned K() const { return K_; }
-
- protected:
- const std::string &work_dir_;
- hash_function hash_;
- unsigned K_;
- uint32_t seed_;
-
- std::string GetRawKMersFname(unsigned suffix) const {
- return path::append_path(work_dir_, "kmers.raw." + std::to_string(suffix));
- }
-
- unsigned GetFileNumForSeq(const Seq &s, unsigned total) const {
- return (unsigned)(hash_(s, seed_) % total);
- }
-
- DECL_LOGGER("K-mer Splitting");
-};
-
-template<class Seq, class traits = kmer_index_traits<Seq> >
-class KMerCounter {
- public:
- typedef typename traits::raw_data_iterator iterator;
- typedef typename traits::raw_data_const_iterator const_iterator;
- typedef typename traits::RawKMerStorage RawKMerStorage;
- typedef typename traits::FinalKMerStorage FinalKMerStorage;
-
- virtual size_t KMerSize() const = 0;
-
- virtual size_t Count(unsigned num_buckets, unsigned num_threads) = 0;
- virtual size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) = 0;
- virtual void MergeBuckets(unsigned num_buckets) = 0;
-
- virtual void OpenBucket(size_t idx, bool unlink = true) = 0;
- virtual void ReleaseBucket(size_t idx) = 0;
- virtual RawKMerStorage* TransferBucket(size_t idx) = 0;
- virtual FinalKMerStorage* GetFinalKMers() = 0;
-
- virtual iterator bucket_begin(size_t idx) = 0;
- virtual iterator bucket_end(size_t idx) = 0;
-
- virtual ~KMerCounter() {}
-
-protected:
- DECL_LOGGER("K-mer Counting");
-};
-
-template<class Seq, class traits = kmer_index_traits<Seq> >
-class KMerDiskCounter : public KMerCounter<Seq> {
- typedef KMerCounter<Seq, traits> __super;
-public:
- KMerDiskCounter(const std::string &work_dir, KMerSplitter<Seq> &splitter)
- : work_dir_(work_dir), splitter_(splitter) {
- std::string prefix = path::append_path(work_dir, "kmers_XXXXXX");
- char *tempprefix = strcpy(new char[prefix.length() + 1], prefix.c_str());
- VERIFY_MSG(-1 != (fd_ = ::mkstemp(tempprefix)), "Cannot create temporary file");
- kmer_prefix_ = tempprefix;
- delete[] tempprefix;
- }
-
- ~KMerDiskCounter() {
- for (size_t i = 0; i < buckets_.size(); ++i)
- ReleaseBucket(i);
-
- ::close(fd_);
- ::unlink(kmer_prefix_.c_str());
- }
-
- size_t KMerSize() const {
- return Seq::GetDataSize(splitter_.K()) * sizeof(typename Seq::DataType);
- }
-
- void OpenBucket(size_t idx, bool unlink = true) {
- unsigned K = splitter_.K();
-
- buckets_[idx] = new MMappedRecordArrayReader<typename Seq::DataType>(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink);
- }
-
- void ReleaseBucket(size_t idx) {
- delete buckets_[idx];
- buckets_[idx] = NULL;
- }
-
- MMappedRecordArrayReader<typename Seq::DataType>* TransferBucket(size_t idx) {
- MMappedRecordArrayReader<typename Seq::DataType> *res = buckets_[idx];
- buckets_[idx] = NULL;
-
- return res;
- }
-
- typename __super::iterator bucket_begin(size_t idx) {
- return buckets_[idx]->begin();
- }
- typename __super::iterator bucket_end(size_t idx) {
- return buckets_[idx]->end();
- }
-
- size_t Count(unsigned num_buckets, unsigned num_threads) {
- unsigned K = splitter_.K();
-
- // Split k-mers into buckets.
- path::files_t raw_kmers = splitter_.Split(num_buckets * num_threads);
-
- INFO("Starting k-mer counting.");
- size_t kmers = 0;
-# pragma omp parallel for shared(raw_kmers) num_threads(num_threads) schedule(dynamic) reduction(+:kmers)
- for (unsigned iFile = 0; iFile < raw_kmers.size(); ++iFile) {
- kmers += MergeKMers(raw_kmers[iFile], GetUniqueKMersFname(iFile), K);
- }
- INFO("K-mer counting done. There are " << kmers << " kmers in total. ");
-
- INFO("Merging temporary buckets.");
- for (unsigned i = 0; i < num_buckets; ++i) {
- std::string ofname = GetMergedKMersFname(i);
- std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
- for (unsigned j = 0; j < num_threads; ++j) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true);
- ofs.write((const char*)ins.data(), ins.data_size());
- }
- }
-
- buckets_.resize(num_buckets);
-
- return kmers;
- }
-
- void MergeBuckets(unsigned num_buckets) {
- unsigned K = splitter_.K();
-
- INFO("Merging final buckets.");
- for (unsigned i = 0; i < num_buckets; ++i)
- VERIFY(buckets_[i] == NULL);
-
- buckets_.clear();
-
- MMappedRecordArrayWriter<typename Seq::DataType> os(GetFinalKMersFname(), Seq::GetDataSize(K));
- std::string ofname = GetFinalKMersFname();
- std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
- for (unsigned j = 0; j < num_buckets; ++j) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(GetMergedKMersFname(j), Seq::GetDataSize(K), /* unlink */ true);
- ofs.write((const char*)ins.data(), ins.data_size());
- }
- ofs.close();
- }
-
- size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) {
- size_t kmers = Count(num_buckets, num_threads);
- if (merge)
- MergeBuckets(num_buckets);
-
- return kmers;
- }
-
- typename __super::FinalKMerStorage *GetFinalKMers() {
- unsigned K = splitter_.K();
- return new MMappedRecordArrayReader<typename Seq::DataType>(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true);
- }
-
- std::string GetMergedKMersFname(unsigned suffix) const {
- return kmer_prefix_ + ".merged." + std::to_string(suffix);
- }
-
- std::string GetFinalKMersFname() const {
- return kmer_prefix_ + ".final";
- }
-
-private:
- std::string work_dir_;
- KMerSplitter<Seq> &splitter_;
- int fd_;
- std::string kmer_prefix_;
-
- std::vector<MMappedRecordArrayReader<typename Seq::DataType>*> buckets_;
-
- std::string GetUniqueKMersFname(unsigned suffix) const {
- return kmer_prefix_ + ".unique." + std::to_string(suffix);
- }
-
- size_t MergeKMers(const std::string &ifname, const std::string &ofname,
- unsigned K) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(ifname, Seq::GetDataSize(K), /* unlink */ true);
-
- // Sort the stuff
- libcxx::sort(ins.begin(), ins.end(), array_less<typename Seq::DataType>());
-
- // FIXME: Use something like parallel version of unique_copy but with explicit
- // resizing.
- auto it = std::unique(ins.begin(), ins.end(), array_equal_to<typename Seq::DataType>());
-
- MMappedRecordArrayWriter<typename Seq::DataType> os(ofname, Seq::GetDataSize(K));
- os.resize(it - ins.begin());
- std::copy(ins.begin(), it, os.begin());
-
- return it - ins.begin();
- }
-};
-
-template<class Index>
-class KMerIndexBuilder {
- typedef typename Index::KMerSeq Seq;
- typedef typename Index::kmer_index_traits kmer_index_traits;
-
- std::string work_dir_;
- unsigned num_buckets_;
- unsigned num_threads_;
-
- public:
- KMerIndexBuilder(const std::string &workdir,
- unsigned num_buckets, unsigned num_threads)
- : work_dir_(workdir), num_buckets_(num_buckets), num_threads_(num_threads) {}
- size_t BuildIndex(Index &out, KMerCounter<Seq> &counter,
- bool save_final = false);
-
- unsigned num_buckets() const { return num_buckets_; }
-
- private:
-
- DECL_LOGGER("K-mer Index Building");
-};
-
-template<class Index>
-size_t KMerIndexBuilder<Index>::BuildIndex(Index &index, KMerCounter<Seq> &counter,
- bool save_final) {
- index.clear();
-
- INFO("Building kmer index ");
-
- // First, count the unique k-mers
- size_t kmers = counter.Count(num_buckets_, num_threads_);
-
- index.num_buckets_ = num_buckets_;
- index.bucket_starts_.resize(num_buckets_ + 1);
- index.index_ = new typename KMerIndex<kmer_index_traits>::KMerDataIndex[num_buckets_];
-
- INFO("Building perfect hash indices");
-
- // Index building requires up to 40 bytes per k-mer. Limit number of threads depending on the memory limit.
- unsigned num_threads = num_threads_;
-# ifdef SPADES_USE_JEMALLOC
- const size_t *cmem = 0;
- size_t clen = sizeof(cmem);
-
- je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
- size_t bucket_size = (36 * kmers + kmers * counter.KMerSize()) / num_buckets_;
- num_threads = std::min<unsigned>((unsigned) ((get_memory_limit() - *cmem) / bucket_size), num_threads);
- if (num_threads < 1)
- num_threads = 1;
- if (num_threads < num_threads_)
- WARN("Number of threads was limited down to " << num_threads << " in order to fit the memory limits during the index construction");
-# endif
-
-# pragma omp parallel for shared(index) num_threads(num_threads)
- for (unsigned iFile = 0; iFile < num_buckets_; ++iFile) {
- typename KMerIndex<kmer_index_traits>::KMerDataIndex &data_index = index.index_[iFile];
- counter.OpenBucket(iFile, !save_final);
- size_t sz = counter.bucket_end(iFile) - counter.bucket_begin(iFile);
- index.bucket_starts_[iFile + 1] = sz;
- typename kmer_index_traits::KMerRawReferenceAdaptor adaptor;
- size_t max_nodes = (size_t(std::ceil(double(sz) * 1.23)) + 2) / 3 * 3;
- if (max_nodes >= uint64_t(1) << 32) {
- emphf::hypergraph_sorter_seq<emphf::hypergraph<uint64_t> > sorter;
- typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
- sz, emphf::range(counter.bucket_begin(iFile), counter.bucket_end(iFile)),
- adaptor).swap(data_index);
- } else {
- emphf::hypergraph_sorter_seq<emphf::hypergraph<uint32_t> > sorter;
- typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
- sz, emphf::range(counter.bucket_begin(iFile), counter.bucket_end(iFile)),
- adaptor).swap(data_index);
- }
-
- counter.ReleaseBucket(iFile);
- }
-
- // Finally, record the sizes of buckets.
- for (unsigned iFile = 1; iFile < num_buckets_; ++iFile)
- index.bucket_starts_[iFile] += index.bucket_starts_[iFile - 1];
-
- if (save_final)
- counter.MergeBuckets(num_buckets_);
-
- double bits_per_kmer = 8.0 * (double)index.mem_size() / (double)kmers;
- INFO("Index built. Total " << index.mem_size() << " bytes occupied (" << bits_per_kmer << " bits per kmer).");
- index.count_size();
- return kmers;
-}
diff --git a/src/modules/data_structures/mph_index/kmer_index.hpp b/src/modules/data_structures/mph_index/kmer_index_builder.hpp
similarity index 53%
copy from src/modules/data_structures/mph_index/kmer_index.hpp
copy to src/modules/data_structures/mph_index/kmer_index_builder.hpp
index 16d2c66..9993ba1 100644
--- a/src/modules/data_structures/mph_index/kmer_index.hpp
+++ b/src/modules/data_structures/mph_index/kmer_index_builder.hpp
@@ -6,14 +6,12 @@
//* See file LICENSE for details.
//***************************************************************************
+#include "kmer_index.hpp"
+
#include "io/kmers_io/mmapped_reader.hpp"
#include "io/kmers_io/mmapped_writer.hpp"
#include "utils/adt/pointer_iterator.hpp"
-
-#include "mphf.hpp"
-#include "base_hash.hpp"
-#include "hypergraph.hpp"
-#include "hypergraph_sorter_seq.hpp"
+#include "utils/adt/kmer_vector.hpp"
#include "dev_support/openmp_wrapper.h"
@@ -21,6 +19,12 @@
#include "dev_support/path_helper.hpp"
#include "dev_support/memory_limit.hpp"
+#include "dev_support/file_limit.hpp"
+
+#include "mphf.hpp"
+#include "base_hash.hpp"
+#include "hypergraph.hpp"
+#include "hypergraph_sorter_seq.hpp"
#include <libcxx/sort.hpp>
@@ -28,9 +32,6 @@
#ifdef USE_GLIBCXX_PARALLEL
#include <parallel/algorithm>
#endif
-#include <fstream>
-#include <vector>
-#include <cmath>
#include "config.hpp"
@@ -38,197 +39,9 @@
# include <jemalloc/jemalloc.h>
#endif
-template<class Index>
-class KMerIndexBuilder;
-
-template<class Seq>
-struct kmer_index_traits {
- typedef Seq SeqType;
- typedef MMappedRecordArrayReader<typename Seq::DataType> RawKMerStorage;
- typedef MMappedRecordArrayReader<typename Seq::DataType> FinalKMerStorage;
- typedef typename RawKMerStorage::iterator raw_data_iterator;
- typedef typename RawKMerStorage::const_iterator raw_data_const_iterator;
- typedef typename RawKMerStorage::iterator::value_type KMerRawData;
- typedef typename RawKMerStorage::iterator::reference KMerRawReference;
- typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference;
-
- struct raw_equal_to {
- bool operator()(const Seq &lhs, const KMerRawReference rhs) {
- return (array_equal_to<typename Seq::DataType>()(lhs.data(), lhs.data_size(), rhs));
- }
- };
-
- struct raw_create {
- Seq operator()(unsigned K, const KMerRawReference kmer) {
- return Seq(K, kmer.data());
- }
- Seq operator()(unsigned K, const KMerRawConstReference kmer) {
- return Seq(K, kmer.data());
- }
- };
-
- struct hash_function {
- uint64_t operator()(const Seq &k) const{
- return typename Seq::hash()(k);
- }
- uint64_t operator()(const KMerRawReference k) const {
- return typename Seq::hash()(k.data(), k.size());
- }
- };
-
- struct KMerRawReferenceAdaptor {
- emphf::byte_range_t operator()(const KMerRawReference k) const {
- const uint8_t * data = (const uint8_t*)k.data();
- return std::make_pair(data, data + k.data_size());
- }
- };
-
- struct KMerSeqAdaptor {
- emphf::byte_range_t operator()(const Seq &k) const {
- const uint8_t * data = (const uint8_t*)k.data();
- return std::make_pair(data, data + k.data_size() * sizeof(typename Seq::DataType));
- }
- };
-
- template<class Writer>
- static void raw_serialize(Writer &writer, RawKMerStorage *data) {
- size_t sz = data->data_size(), elcnt = data->elcnt();
- unsigned PageSize = getpagesize();
- writer.write((char*)&sz, sizeof(sz));
- writer.write((char*)&elcnt, sizeof(elcnt));
- // Make sure data is aligned to the page boundary
- size_t cpos = writer.tellp();
- size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize;
- size_t off = pos - writer.tellp();
- writer.write((char*)&off, sizeof(off));
- writer.seekp(pos);
- writer.write((char*)data->data(), data->data_size());
- }
-
- template<class Reader>
- static RawKMerStorage *raw_deserialize(Reader &reader, const std::string &FileName) {
- size_t sz, off, elcnt;
- reader.read((char*)&sz, sizeof(sz));
- reader.read((char*)&elcnt, sizeof(elcnt));
- reader.read((char*)&off, sizeof(off));
- off -= sizeof(off);
- off += reader.tellg();
-
- return new RawKMerStorage(FileName, elcnt, false, off, sz);
- }
-
-};
-
-template<class traits>
-class KMerIndex {
- public:
- typedef traits kmer_index_traits;
- typedef typename traits::SeqType KMerSeq;
- typedef typename traits::hash_function hash_function;
- typedef typename traits::KMerRawData KMerRawData;
- typedef typename traits::KMerRawReference KMerRawReference;
- typedef size_t IdxType;
-
- private:
- using KMerDataIndex = emphf::mphf<emphf::city_hasher>;
- typedef KMerIndex __self;
-
- public:
- KMerIndex(): index_(NULL), num_buckets_(0), size_(0) {}
-
- KMerIndex(const KMerIndex&) = delete;
- KMerIndex& operator=(const KMerIndex&) = delete;
-
- ~KMerIndex() { clear(); }
-
- void clear() {
- num_buckets_ = 0;
- bucket_starts_.clear();
-
- delete[] index_;
- index_ = NULL;
- }
-
- size_t mem_size() {
- size_t sz = 0;
- for (size_t i = 0; i < num_buckets_; ++i)
- sz += index_[i].mem_size();
-
- return sz;
- }
-
- void count_size() {
- if (index_ == NULL)
- return;
- size_ = 0;
- for (size_t i = 0; i < num_buckets_; i++)
- size_ += index_[i].size();
- }
-
- size_t size() const {
- return size_;
- }
-
- size_t seq_idx(const KMerSeq &s) const {
- size_t bucket = seq_bucket(s);
-
- return bucket_starts_[bucket] +
- index_[bucket].lookup(s, typename traits::KMerSeqAdaptor());
- }
-
- size_t raw_seq_idx(const KMerRawReference data) const {
- size_t bucket = raw_seq_bucket(data);
-
- return bucket_starts_[bucket] +
- index_[bucket].lookup(data, typename traits::KMerRawReferenceAdaptor());
- }
-
- template<class Writer>
- void serialize(Writer &os) const {
- os.write((char*)&num_buckets_, sizeof(num_buckets_));
- for (size_t i = 0; i < num_buckets_; ++i)
- index_[i].save(os);
- os.write((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0]));
- }
-
- template<class Reader>
- void deserialize(Reader &is) {
- clear();
-
- is.read((char*)&num_buckets_, sizeof(num_buckets_));
-
- index_ = new KMerDataIndex[num_buckets_];
- for (size_t i = 0; i < num_buckets_; ++i)
- index_[i].load(is);
-
- bucket_starts_.resize(num_buckets_ + 1);
- is.read((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0]));
- count_size();
- }
-
- void swap(KMerIndex<traits> &other) {
- std::swap(index_, other.index_);
- std::swap(num_buckets_, other.num_buckets_);
- std::swap(size_, other.size_);
- std::swap(bucket_starts_, other.bucket_starts_);
- }
-
- private:
- KMerDataIndex *index_;
-
- size_t num_buckets_;
- std::vector<size_t> bucket_starts_;
- size_t size_;
-
- size_t seq_bucket(const KMerSeq &s) const {
- return hash_function()(s) % num_buckets_;
- }
- size_t raw_seq_bucket(const KMerRawReference data) const {
- return hash_function()(data) % num_buckets_;
- }
-
- friend class KMerIndexBuilder<__self>;
-};
+#include <fstream>
+#include <vector>
+#include <cmath>
template<class Seq>
class KMerSplitter {
@@ -242,6 +55,10 @@ class KMerSplitter {
virtual path::files_t Split(size_t num_files) = 0;
+ size_t kmer_size() const {  // size of one k-mer record: Seq::GetDataSize(K_) elements of Seq::DataType
+ return Seq::GetDataSize(K_) * sizeof(typename Seq::DataType);  // presumably a byte count (element count times sizeof) — GetDataSize is not visible here, confirm
+ }
+
unsigned K() const { return K_; }
protected:
@@ -250,15 +67,109 @@ class KMerSplitter {
unsigned K_;
uint32_t seed_;
+ DECL_LOGGER("K-mer Splitting");
+};
+
+template<class Seq>
+class KMerSortingSplitter : public KMerSplitter<Seq> {  // splitter that buffers k-mers per thread and per output file, then sorts and deduplicates each batch when dumping
+ public:
+ KMerSortingSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
+ : KMerSplitter<Seq>(work_dir, K, seed), cell_size_(0), num_files_(0) {}  // both sizes stay 0 until PrepareBuffers() assigns them
+
+ protected:
+ using SeqKMerVector = KMerVector<Seq>;
+ using KMerBuffer = std::vector<SeqKMerVector>;  // one KMerVector per output file
+
+ std::vector<KMerBuffer> kmer_buffers_;  // indexed as [thread][file]
+ size_t cell_size_;  // per-buffer element threshold tested by push_back_internal()
+ size_t num_files_;  // number of output files, assigned in PrepareBuffers()
+
+ path::files_t PrepareBuffers(size_t num_files, unsigned nthreads, size_t reads_buffer_size) {  // returns the raw k-mer file names and sets up the per-thread buffers
+ num_files_ = num_files;
+
+ // Determine the set of output files
+ path::files_t out;
+ for (unsigned i = 0; i < num_files_; ++i)
+ out.push_back(this->GetRawKMersFname(i));
+
+ size_t file_limit = num_files_ + 2*nthreads;  // requested limit: one per output file plus two per thread
+ size_t res = limit_file(file_limit);  // NOTE(review): presumably adjusts the open-file limit and returns the value obtained — confirm in file_limit.hpp
+ if (res < file_limit) {
+ WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
+ WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
+ }
+
+ if (reads_buffer_size == 0) {  // 0 means the size is chosen here
+ reads_buffer_size = 536870912ull;  // 2^29; acts as the upper bound in the std::min below
+ size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));  // NOTE(review): no guard for nthreads == 0 — confirm callers never pass 0
+ INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
+ reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+ }
+ cell_size_ = reads_buffer_size / (num_files_ * this->kmer_size());  // elements per buffer; NOTE(review): no guard for num_files_ == 0
+ // Set sane minimum cell size
+ if (cell_size_ < 16384)
+ cell_size_ = 16384;
+
+ INFO("Using cell size of " << cell_size_);
+ kmer_buffers_.resize(nthreads);
+ for (unsigned i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = kmer_buffers_[i];
+ entry.resize(num_files_, KMerVector<Seq>(this->K_, (size_t) (1.1 * (double) cell_size_)));  // second argument is 10% above cell_size_; presumably an initial capacity — confirm in kmer_vector.hpp
+ }
+
+ return out;
+ }
+
+ bool push_back_internal(const Seq &seq, unsigned thread_id) {  // appends seq to this thread's buffer for its target file; true once that buffer exceeds cell_size_
+ KMerBuffer &entry = kmer_buffers_[thread_id];
+
+ size_t idx = this->GetFileNumForSeq(seq, (unsigned)num_files_);  // target file chosen by GetFileNumForSeq()
+ entry[idx].push_back(seq);
+ return entry[idx].size() > cell_size_;
+ }
+
+ void DumpBuffers(const path::files_t &ostreams) {  // sorts, deduplicates and appends each file's buffered k-mers to ostreams[k], then clears every buffer
+ VERIFY(ostreams.size() == num_files_ && kmer_buffers_[0].size() == num_files_);
+
+# pragma omp parallel for
+ for (unsigned k = 0; k < num_files_; ++k) {
+ // NOTE(review): k is the output file index (it ranges over num_files_), not a thread id
+
+ size_t sz = 0;  // total number of k-mers buffered for file k across all threads
+ for (size_t i = 0; i < kmer_buffers_.size(); ++i)
+ sz += kmer_buffers_[i][k].size();
+
+ KMerVector<Seq> SortBuffer(this->K_, sz);
+ for (auto & entry : kmer_buffers_) {
+ const auto &buffer = entry[k];
+ for (size_t j = 0; j < buffer.size(); ++j)
+ SortBuffer.push_back(buffer[j]);
+ }
+ libcxx::sort(SortBuffer.begin(), SortBuffer.end(), typename KMerVector<Seq>::less2_fast());
+ auto it = std::unique(SortBuffer.begin(), SortBuffer.end(), typename KMerVector<Seq>::equal_to());  // `it` marks the end of the deduplicated range
+
+# pragma omp critical
+ {
+ FILE *f = fopen(ostreams[k].c_str(), "ab");  // append mode: data from earlier dumps to the same file is kept
+ VERIFY_MSG(f, "Cannot open temporary file to write");
+ fwrite(SortBuffer.data(), SortBuffer.el_data_size(), it - SortBuffer.begin(), f);  // writes only the deduplicated prefix [begin, it); the fwrite result is not checked
+ fclose(f);
+ }
+ }
+
+ for (auto & entry : kmer_buffers_)
+ for (auto & eentry : entry)
+ eentry.clear();
+ }
+
std::string GetRawKMersFname(unsigned suffix) const {
- return path::append_path(work_dir_, "kmers.raw." + std::to_string(suffix));
+ return path::append_path(this->work_dir_, "kmers.raw." + std::to_string(suffix));
}
unsigned GetFileNumForSeq(const Seq &s, unsigned total) const {
- return (unsigned)(hash_(s, seed_) % total);
+ return (unsigned)(this->hash_(s, this->seed_) % total);
}
- DECL_LOGGER("K-mer Splitting");
};
template<class Seq, class traits = kmer_index_traits<Seq> >
@@ -269,19 +180,14 @@ class KMerCounter {
typedef typename traits::RawKMerStorage RawKMerStorage;
typedef typename traits::FinalKMerStorage FinalKMerStorage;
- virtual size_t KMerSize() const = 0;
+ virtual size_t kmer_size() const = 0;
virtual size_t Count(unsigned num_buckets, unsigned num_threads) = 0;
virtual size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) = 0;
virtual void MergeBuckets(unsigned num_buckets) = 0;
- virtual void OpenBucket(size_t idx, bool unlink = true) = 0;
- virtual void ReleaseBucket(size_t idx) = 0;
- virtual RawKMerStorage* TransferBucket(size_t idx) = 0;
- virtual FinalKMerStorage* GetFinalKMers() = 0;
-
- virtual iterator bucket_begin(size_t idx) = 0;
- virtual iterator bucket_end(size_t idx) = 0;
+ virtual std::unique_ptr<RawKMerStorage> GetBucket(size_t idx, bool unlink = true) = 0;
+ virtual std::unique_ptr<FinalKMerStorage> GetFinalKMers() = 0;
virtual ~KMerCounter() {}
@@ -291,7 +197,8 @@ protected:
template<class Seq, class traits = kmer_index_traits<Seq> >
class KMerDiskCounter : public KMerCounter<Seq> {
- typedef KMerCounter<Seq, traits> __super;
+ typedef KMerCounter<Seq, traits> __super;
+ typedef typename traits::RawKMerStorage BucketStorage;
public:
KMerDiskCounter(const std::string &work_dir, KMerSplitter<Seq> &splitter)
: work_dir_(work_dir), splitter_(splitter) {
@@ -303,43 +210,20 @@ public:
}
~KMerDiskCounter() {
- for (size_t i = 0; i < buckets_.size(); ++i)
- ReleaseBucket(i);
-
::close(fd_);
::unlink(kmer_prefix_.c_str());
}
- size_t KMerSize() const {
+ size_t kmer_size() const override {
return Seq::GetDataSize(splitter_.K()) * sizeof(typename Seq::DataType);
}
- void OpenBucket(size_t idx, bool unlink = true) {
+ std::unique_ptr<BucketStorage> GetBucket(size_t idx, bool unlink = true) override {
unsigned K = splitter_.K();
-
- buckets_[idx] = new MMappedRecordArrayReader<typename Seq::DataType>(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink);
- }
-
- void ReleaseBucket(size_t idx) {
- delete buckets_[idx];
- buckets_[idx] = NULL;
- }
-
- MMappedRecordArrayReader<typename Seq::DataType>* TransferBucket(size_t idx) {
- MMappedRecordArrayReader<typename Seq::DataType> *res = buckets_[idx];
- buckets_[idx] = NULL;
-
- return res;
+ return std::unique_ptr<BucketStorage>(new BucketStorage(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink));
}
- typename __super::iterator bucket_begin(size_t idx) {
- return buckets_[idx]->begin();
- }
- typename __super::iterator bucket_end(size_t idx) {
- return buckets_[idx]->end();
- }
-
- size_t Count(unsigned num_buckets, unsigned num_threads) {
+ size_t Count(unsigned num_buckets, unsigned num_threads) override {
unsigned K = splitter_.K();
// Split k-mers into buckets.
@@ -358,36 +242,30 @@ public:
std::string ofname = GetMergedKMersFname(i);
std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
for (unsigned j = 0; j < num_threads; ++j) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true);
+ BucketStorage ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true);
ofs.write((const char*)ins.data(), ins.data_size());
}
}
- buckets_.resize(num_buckets);
-
return kmers;
}
- void MergeBuckets(unsigned num_buckets) {
+ void MergeBuckets(unsigned num_buckets) override {
unsigned K = splitter_.K();
INFO("Merging final buckets.");
- for (unsigned i = 0; i < num_buckets; ++i)
- VERIFY(buckets_[i] == NULL);
-
- buckets_.clear();
MMappedRecordArrayWriter<typename Seq::DataType> os(GetFinalKMersFname(), Seq::GetDataSize(K));
std::string ofname = GetFinalKMersFname();
std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
for (unsigned j = 0; j < num_buckets; ++j) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(GetMergedKMersFname(j), Seq::GetDataSize(K), /* unlink */ true);
- ofs.write((const char*)ins.data(), ins.data_size());
+ auto bucket = GetBucket(j, /* unlink */ true);
+ ofs.write((const char*)bucket->data(), bucket->data_size());
}
ofs.close();
}
- size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) {
+ size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) override {
size_t kmers = Count(num_buckets, num_threads);
if (merge)
MergeBuckets(num_buckets);
@@ -395,9 +273,9 @@ public:
return kmers;
}
- typename __super::FinalKMerStorage *GetFinalKMers() {
+ std::unique_ptr<typename __super::FinalKMerStorage> GetFinalKMers() override {
unsigned K = splitter_.K();
- return new MMappedRecordArrayReader<typename Seq::DataType>(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true);
+ return std::unique_ptr<typename __super::FinalKMerStorage>(new typename __super::FinalKMerStorage(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true));
}
std::string GetMergedKMersFname(unsigned suffix) const {
@@ -414,8 +292,6 @@ private:
int fd_;
std::string kmer_prefix_;
- std::vector<MMappedRecordArrayReader<typename Seq::DataType>*> buckets_;
-
std::string GetUniqueKMersFname(unsigned suffix) const {
return kmer_prefix_ + ".unique." + std::to_string(suffix);
}
@@ -485,7 +361,7 @@ size_t KMerIndexBuilder<Index>::BuildIndex(Index &index, KMerCounter<Seq> &count
size_t clen = sizeof(cmem);
je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
- size_t bucket_size = (36 * kmers + kmers * counter.KMerSize()) / num_buckets_;
+ size_t bucket_size = (36 * kmers + kmers * counter.kmer_size()) / num_buckets_;
num_threads = std::min<unsigned>((unsigned) ((get_memory_limit() - *cmem) / bucket_size), num_threads);
if (num_threads < 1)
num_threads = 1;
@@ -496,24 +372,22 @@ size_t KMerIndexBuilder<Index>::BuildIndex(Index &index, KMerCounter<Seq> &count
# pragma omp parallel for shared(index) num_threads(num_threads)
for (unsigned iFile = 0; iFile < num_buckets_; ++iFile) {
typename KMerIndex<kmer_index_traits>::KMerDataIndex &data_index = index.index_[iFile];
- counter.OpenBucket(iFile, !save_final);
- size_t sz = counter.bucket_end(iFile) - counter.bucket_begin(iFile);
+ auto bucket = counter.GetBucket(iFile, !save_final);
+ size_t sz = bucket->end() - bucket->begin();
index.bucket_starts_[iFile + 1] = sz;
typename kmer_index_traits::KMerRawReferenceAdaptor adaptor;
size_t max_nodes = (size_t(std::ceil(double(sz) * 1.23)) + 2) / 3 * 3;
if (max_nodes >= uint64_t(1) << 32) {
emphf::hypergraph_sorter_seq<emphf::hypergraph<uint64_t> > sorter;
typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
- sz, emphf::range(counter.bucket_begin(iFile), counter.bucket_end(iFile)),
+ sz, emphf::range(bucket->begin(), bucket->end()),
adaptor).swap(data_index);
} else {
emphf::hypergraph_sorter_seq<emphf::hypergraph<uint32_t> > sorter;
typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
- sz, emphf::range(counter.bucket_begin(iFile), counter.bucket_end(iFile)),
+ sz, emphf::range(bucket->begin(), bucket->end()),
adaptor).swap(data_index);
}
-
- counter.ReleaseBucket(iFile);
}
// Finally, record the sizes of buckets.
diff --git a/src/modules/data_structures/mph_index/kmer_index_traits.hpp b/src/modules/data_structures/mph_index/kmer_index_traits.hpp
new file mode 100644
index 0000000..c9ef67b
--- /dev/null
+++ b/src/modules/data_structures/mph_index/kmer_index_traits.hpp
@@ -0,0 +1,87 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/kmers_io/mmapped_reader.hpp"
+#include "mphf.hpp"
+
+template<class Seq>
+struct kmer_index_traits {
+ typedef Seq SeqType;
+ typedef MMappedRecordArrayReader<typename Seq::DataType> RawKMerStorage;
+ typedef MMappedRecordArrayReader<typename Seq::DataType> FinalKMerStorage;
+ typedef typename RawKMerStorage::iterator raw_data_iterator;
+ typedef typename RawKMerStorage::const_iterator raw_data_const_iterator;
+ typedef typename RawKMerStorage::iterator::value_type KMerRawData;
+ typedef typename RawKMerStorage::iterator::reference KMerRawReference;
+ typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference;
+
+ struct raw_equal_to {
+ bool operator()(const Seq &lhs, const KMerRawReference rhs) {
+ return (array_equal_to<typename Seq::DataType>()(lhs.data(), lhs.data_size(), rhs));
+ }
+ };
+
+ struct raw_create {
+ Seq operator()(unsigned K, const KMerRawReference kmer) {
+ return Seq(K, kmer.data());
+ }
+ Seq operator()(unsigned K, const KMerRawConstReference kmer) {
+ return Seq(K, kmer.data());
+ }
+ };
+
+ struct hash_function {
+ uint64_t operator()(const Seq &k) const{
+ return typename Seq::hash()(k);
+ }
+ uint64_t operator()(const KMerRawReference k) const {
+ return typename Seq::hash()(k.data(), k.size());
+ }
+ };
+
+ struct KMerRawReferenceAdaptor {
+ emphf::byte_range_t operator()(const KMerRawReference k) const {
+ const uint8_t * data = (const uint8_t*)k.data();
+ return std::make_pair(data, data + k.data_size());
+ }
+ };
+
+ struct KMerSeqAdaptor {
+ emphf::byte_range_t operator()(const Seq &k) const {
+ const uint8_t * data = (const uint8_t*)k.data();
+ return std::make_pair(data, data + k.data_size() * sizeof(typename Seq::DataType));
+ }
+ };
+
+ template<class Writer>
+ static void raw_serialize(Writer &writer, RawKMerStorage *data) {
+ size_t sz = data->data_size(), elcnt = data->elcnt();
+ unsigned PageSize = getpagesize();
+ writer.write((char*)&sz, sizeof(sz));
+ writer.write((char*)&elcnt, sizeof(elcnt));
+ // Make sure data is aligned to the page boundary
+ size_t cpos = writer.tellp();
+ size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize;
+ size_t off = pos - writer.tellp();
+ writer.write((char*)&off, sizeof(off));
+ writer.seekp(pos);
+ writer.write((char*)data->data(), data->data_size());
+ }
+
+ template<class Reader>
+ static std::unique_ptr<RawKMerStorage> raw_deserialize(Reader &reader, const std::string &FileName) {
+ size_t sz, off, elcnt;
+ reader.read((char*)&sz, sizeof(sz));
+ reader.read((char*)&elcnt, sizeof(elcnt));
+ reader.read((char*)&off, sizeof(off));
+ off -= sizeof(off);
+ off += reader.tellg();
+
+ return std::unique_ptr<RawKMerStorage>(new RawKMerStorage(FileName, elcnt, false, off, sz));
+ }
+
+};
diff --git a/src/modules/data_structures/mph_index/mphf.hpp b/src/modules/data_structures/mph_index/mphf.hpp
index a00c6fd..6c364ca 100644
--- a/src/modules/data_structures/mph_index/mphf.hpp
+++ b/src/modules/data_structures/mph_index/mphf.hpp
@@ -87,7 +87,7 @@ namespace emphf {
}
template <typename T, typename Adaptor>
- uint64_t lookup(T val, Adaptor adaptor)
+ uint64_t lookup(const T &val, Adaptor adaptor)
{
using std::get;
auto hashes = m_hasher(adaptor(val));
diff --git a/src/modules/data_structures/sequence/rtseq.hpp b/src/modules/data_structures/sequence/rtseq.hpp
index e67e855..ea1e279 100644
--- a/src/modules/data_structures/sequence/rtseq.hpp
+++ b/src/modules/data_structures/sequence/rtseq.hpp
@@ -290,6 +290,9 @@ public:
// which symbols does our string contain : 0123 or ACGT?
bool digit_str = size_ == 0 || is_dignucl(s[0]);
+ // we fill everything with zeros (As) by default.
+ std::fill(data_.begin(), data_.end(), 0);
+
// data -- one temporary variable corresponding to the i-th array element
// and some counters
T data = 0;
@@ -586,17 +589,18 @@ public:
bool operator==(const RuntimeSeq<max_size_, T> &s) const {
VERIFY(size_ == s.size_);
- // INFO(this->full_str());
- // INFO(s.full_str());
- return 0 == memcmp(data_.data(), s.data_.data(), sizeof(T) * DataSize);
+
+ size_t data_size = GetDataSize(size_);
+ for (size_t i = 0; i < data_size; ++i)
+ if (data_[i] != s.data_[i])
+ return false;
+
+ return true;
}
/**
* @see operator ==()
*/
-
-
-
bool operator!=(const RuntimeSeq<max_size_, T> &s) const {
return !operator==(s);
}
diff --git a/src/modules/data_structures/sequence/sequence.hpp b/src/modules/data_structures/sequence/sequence.hpp
index cf9304f..b25d217 100755
--- a/src/modules/data_structures/sequence/sequence.hpp
+++ b/src/modules/data_structures/sequence/sequence.hpp
@@ -262,6 +262,17 @@ public:
return size_;
}
+ template<class Seq>
+ bool contains(const Seq& s, size_t offset = 0) const {
+ VERIFY(offset + s.size() <= size());
+
+ for (size_t i = 0, e = s.size(); i != e; ++i)
+ if (operator[](offset + i) != s[i])
+ return false;
+
+ return true;
+ }
+
private:
inline bool ReadHeader(std::istream &file);
diff --git a/src/modules/data_structures/sequence/simple_seq.hpp b/src/modules/data_structures/sequence/simple_seq.hpp
index 8c5642f..77d0fe3 100644
--- a/src/modules/data_structures/sequence/simple_seq.hpp
+++ b/src/modules/data_structures/sequence/simple_seq.hpp
@@ -127,7 +127,10 @@ public:
struct equal_to {
bool operator()(const SimpleSeq<size_, T>& l, const SimpleSeq<size_, T>& r) const {
- return memcmp(l.data_.data(), r.data_.data(), sizeof(T) * DataSize) == 0;
+ for (size_t i = 0; i < DataSize; ++i)
+ if (l.data_[i] != r.data_[i])
+ return false;
+ return true;
}
};
diff --git a/src/modules/io/dataset_support/read_converter.hpp b/src/modules/io/dataset_support/read_converter.hpp
index 736c793..1182e7e 100644
--- a/src/modules/io/dataset_support/read_converter.hpp
+++ b/src/modules/io/dataset_support/read_converter.hpp
@@ -23,148 +23,140 @@
namespace debruijn_graph {
+typedef io::SequencingLibrary<config::DataSetData> SequencingLibrary;
+
class ReadConverter {
private:
- const static size_t current_binary_format_version = 10;
-
- void convert_reads_to_binary() {
- if (path::FileExists(cfg::get().temp_bin_reads_info)) {
- std::ifstream info;
- info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::in);
-
- size_t thread_num = 0;
- size_t format = 0;
- size_t lib_count = 0;
-
- info >> format;
- if (!info.eof()) {
- info >> thread_num;
- }
- if (!info.eof()) {
- info >> lib_count;
- }
-
- if (thread_num == cfg::get().max_threads && format == current_binary_format_version && lib_count == cfg::get().ds.reads.lib_count()) {
- INFO("Binary reads detected");
-
- io::ReadStreamStat stat;
- info >> stat.read_count_;
- info >> stat.max_len_;
- info >> stat.total_len_;
-
- auto &dataset = cfg::get_writable().ds.reads;
- for (size_t i = 0; i < dataset.lib_count(); ++i) {
- info >> dataset[i].data().binary_coverted;
- info >> dataset[i].data().read_length;
- info >> dataset[i].data().total_nucls;
-
- dataset[i].data().thread_num = cfg::get().max_threads;
- dataset[i].data().paired_read_prefix = cfg::get().paired_read_prefix + "_" + ToString(i);
- dataset[i].data().single_read_prefix = cfg::get().single_read_prefix + "_" + ToString(i);
- }
- info.close();
- return;
- }
- info.close();
+ const static size_t current_binary_format_version = 11;
+
+ static bool LoadLibIfExists(SequencingLibrary& lib) {
+ auto& data = lib.data();
+
+ if (!path::FileExists(data.binary_reads_info.bin_reads_info_file))
+ return false;
+
+ std::ifstream info;
+ info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::in);
+ DEBUG("Reading binary information file " << data.binary_reads_info.bin_reads_info_file);
+
+ size_t chunk_num = 0;
+ size_t format = 0;
+ size_t lib_index = 0;
+
+ info >> format;
+ if (!info.eof()) {
+ info >> chunk_num;
+ }
+ if (!info.eof()) {
+ info >> lib_index;
}
+ if (chunk_num != data.binary_reads_info.chunk_num ||
+ format != current_binary_format_version ||
+ lib_index != data.lib_index) {
+ return false;
+ }
+
+ INFO("Binary reads detected");
+ info >> data.read_length;
+ info >> data.read_count;
+ info >> data.total_nucls;
+ data.binary_reads_info.binary_coverted = true;
+
+ info.close();
+ return true;
+ }
+
+ static void ConvertToBinary(SequencingLibrary& lib) {
+ auto& data = lib.data();
std::ofstream info;
- info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::out);
- info << "0 0";
+ info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::out);
+ info << "0 0 0";
info.close();
- io::ReadStreamStat total_stat;
- auto& dataset = cfg::get_writable().ds.reads;
-
- INFO("Converting reads to binary format (takes a while)");
- for (size_t i = 0; i < dataset.lib_count(); ++i) {
- if (cfg::get().bwa.bwa_enable && dataset[i].is_bwa_alignable()) {
- INFO("Library #" << i << " will be used by BWA only and thus will not be converted");
- continue;
- }
- else if (dataset[i].is_binary_covertable()) {
- INFO("Paired reads for library #" << i);
- dataset[i].data().thread_num = cfg::get().max_threads;
- dataset[i].data().paired_read_prefix = cfg::get().paired_read_prefix + "_" + ToString(i);
-
- io::PairedStreamPtr paired_reader = paired_easy_reader(dataset[i], false, 0, false, false);
- io::BinaryWriter paired_converter
- (dataset[i].data().paired_read_prefix, cfg::get().max_threads, cfg::get().buffer_size);
- io::ReadStreamStat paired_stat = paired_converter.ToBinary(*paired_reader, dataset[i].orientation());
- paired_stat.read_count_ *= 2;
- total_stat.merge(paired_stat);
-
- INFO("Single reads for library #" << i);
- dataset[i].data().single_read_prefix = cfg::get().single_read_prefix + "_" + ToString(i);
- io::SingleStreamPtr single_reader = single_easy_reader(dataset[i], false, false);
- io::BinaryWriter single_converter
- (dataset[i].data().single_read_prefix, cfg::get().max_threads, cfg::get().buffer_size);
- io::ReadStreamStat single_stat = single_converter.ToBinary(*single_reader);
- total_stat.merge(single_stat);
-
- paired_stat.merge(single_stat);
- dataset[i].data().read_length = paired_stat.max_len_;
- dataset[i].data().total_nucls = paired_stat.total_len_;
- dataset[i].data().binary_coverted = true;
- }
- else {
- INFO("Library #" << i << " doesn't need to be converted");
- }
- }
- info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::out);
- info << current_binary_format_version << " " << cfg::get().max_threads << " " << cfg::get().ds.reads.lib_count() << " " <<
- total_stat.read_count_ << " " << total_stat.max_len_ << " " << total_stat.total_len_ << "\n";
-
- for (size_t i = 0; i < dataset.lib_count(); ++i) {
- info << dataset[i].data().binary_coverted
- << " " << dataset[i].data().read_length
- << " " << dataset[i].data().total_nucls << "\n";
- }
+ INFO("Converting reads to binary format for library #" << data.lib_index << " (takes a while)");
+ INFO("Converting paired reads");
+ io::PairedStreamPtr paired_reader = paired_easy_reader(lib, false, 0, false, false);
+ io::BinaryWriter paired_converter(data.binary_reads_info.paired_read_prefix,
+ data.binary_reads_info.chunk_num,
+ data.binary_reads_info.buffer_size);
+
+ io::ReadStreamStat paired_stat = paired_converter.ToBinary(*paired_reader, lib.orientation());
+ paired_stat.read_count_ *= 2;
+
+ INFO("Converting single reads");
+
+ io::SingleStreamPtr single_reader = single_easy_reader(lib, false, false);
+ io::BinaryWriter single_converter(data.binary_reads_info.single_read_prefix,
+ data.binary_reads_info.chunk_num,
+ data.binary_reads_info.buffer_size);
+ io::ReadStreamStat single_stat = single_converter.ToBinary(*single_reader);
+
+ paired_stat.merge(single_stat);
+ data.read_length = paired_stat.max_len_;
+ data.read_count = paired_stat.read_count_;
+ data.total_nucls = paired_stat.total_len_;
+
+ info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::out);
+ info << current_binary_format_version << " " <<
+ data.binary_reads_info.chunk_num << " " <<
+ data.lib_index << " " <<
+ data.read_length << " " <<
+ data.read_count << " " <<
+ data.total_nucls << "\n";
+
info.close();
+ data.binary_reads_info.binary_coverted = true;
}
public:
- ReadConverter() {
- convert_reads_to_binary();
+ static void ConvertToBinaryIfNeeded(SequencingLibrary& lib) {
+ if (lib.data().binary_reads_info.binary_coverted)
+ return;
+
+ if (LoadLibIfExists(lib)) {
+ return;
+ }
+
+ ConvertToBinary(lib);
}
};
inline
-void convert_if_needed() {
- static ReadConverter converter;
-}
-
-inline
-io::BinaryPairedStreams raw_paired_binary_readers(const io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- size_t insert_size = 0) {
- convert_if_needed();
- VERIFY_MSG(lib.data().binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
+io::BinaryPairedStreams raw_paired_binary_readers(io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+ ReadConverter::ConvertToBinaryIfNeeded(lib);
+ const auto& data = lib.data();
+ VERIFY_MSG(data.binary_reads_info.binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
io::ReadStreamList<io::PairedReadSeq> paired_streams;
- for (size_t i = 0; i < lib.data().thread_num; ++i) {
- paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(lib.data().paired_read_prefix, i, insert_size));
+ for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
+ paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(data.binary_reads_info.paired_read_prefix,
+ i, insert_size));
}
return io::apply_paired_wrappers(followed_by_rc, paired_streams);
}
inline
-io::BinarySingleStreams raw_single_binary_readers(const io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads) {
- convert_if_needed();
- VERIFY_MSG(lib.data().binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
+io::BinarySingleStreams raw_single_binary_readers(io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ const auto& data = lib.data();
+ ReadConverter::ConvertToBinaryIfNeeded(lib);
+ VERIFY_MSG(data.binary_reads_info.binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
io::BinarySingleStreams single_streams;
- for (size_t i = 0; i < lib.data().thread_num; ++i) {
- single_streams.push_back(make_shared<io::BinaryFileSingleStream>(lib.data().single_read_prefix, i));
+ for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
+ single_streams.push_back(make_shared<io::BinaryFileSingleStream>(data.binary_reads_info.single_read_prefix, i));
}
if (including_paired_reads) {
io::BinaryPairedStreams paired_streams;
- for (size_t i = 0; i < lib.data().thread_num; ++i) {
- paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(lib.data().paired_read_prefix, i, 0));
+ for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
+ paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(data.binary_reads_info.paired_read_prefix,
+ i, 0));
}
return io::apply_single_wrappers(followed_by_rc, single_streams, &paired_streams);
@@ -176,185 +168,106 @@ io::BinarySingleStreams raw_single_binary_readers(const io::SequencingLibrary<co
inline
-io::BinaryPairedStreams paired_binary_readers(const io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- size_t insert_size = 0) {
- convert_if_needed();
+io::BinaryPairedStreams paired_binary_readers(io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
return raw_paired_binary_readers(lib, followed_by_rc, insert_size);
}
inline
-io::BinarySingleStreams single_binary_readers(const io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads) {
- convert_if_needed();
+io::BinarySingleStreams single_binary_readers(io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ bool including_paired_reads) {
return raw_single_binary_readers(lib, followed_by_rc, including_paired_reads);
}
inline
//todo simplify
-io::BinaryPairedStreams paired_binary_readers_for_libs(const std::vector<size_t>& libs,
- bool followed_by_rc,
- size_t insert_size = 0) {
- convert_if_needed();
+io::BinaryPairedStreams paired_binary_readers_for_libs(config::dataset& dataset_info,
+ const std::vector<size_t>& libs,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
- std::vector<io::BinaryPairedStreams> streams(cfg::get().max_threads);
- for (size_t i = 0; i < libs.size(); ++i) {
- io::BinaryPairedStreams lib_streams = raw_paired_binary_readers(cfg::get().ds.reads[libs[i]], followed_by_rc, insert_size);
+ VERIFY(!libs.empty())
+ size_t chunk_num = dataset_info.reads[libs.front()].data().binary_reads_info.chunk_num;
- for (size_t j = 0; j < cfg::get().max_threads; ++j) {
- streams[j].push_back(lib_streams.ptr_at(j));
- }
+ std::vector<io::BinaryPairedStreams> streams(chunk_num);
+ for (size_t i = 0; i < libs.size(); ++i) {
+ VERIFY_MSG(chunk_num == dataset_info.reads[libs[i]].data().binary_reads_info.chunk_num,
+ "Cannot create stream for multiple libraries with different chunk_num")
+ io::BinaryPairedStreams lib_streams = raw_paired_binary_readers(dataset_info.reads[libs[i]], followed_by_rc, insert_size);
+ for (size_t j = 0; j < chunk_num; ++j) {
+ streams[j].push_back(lib_streams.ptr_at(j));
+ }
}
io::BinaryPairedStreams joint_streams;
- for (size_t j = 0; j < cfg::get().max_threads; ++j) {
+ for (size_t j = 0; j < chunk_num; ++j) {
joint_streams.push_back(io::MultifileWrap<io::PairedReadSeq>(streams[j]));
}
return joint_streams;
}
inline
-io::BinarySingleStreams single_binary_readers_for_libs(const std::vector<size_t>& libs,
- bool followed_by_rc,
- bool including_paired_reads) {
- convert_if_needed();
-
- std::vector<io::BinarySingleStreams> streams(cfg::get().max_threads);
+io::BinarySingleStreams single_binary_readers_for_libs(config::dataset& dataset_info,
+ const std::vector<size_t>& libs,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ VERIFY(!libs.empty())
+ size_t chunk_num = dataset_info.reads[libs.front()].data().binary_reads_info.chunk_num;
+
+ std::vector<io::BinarySingleStreams> streams(chunk_num);
for (size_t i = 0; i < libs.size(); ++i) {
- io::BinarySingleStreams lib_streams = raw_single_binary_readers(cfg::get().ds.reads[libs[i]], followed_by_rc, including_paired_reads);
+ VERIFY_MSG(chunk_num == dataset_info.reads[libs[i]].data().binary_reads_info.chunk_num,
+ "Cannot create stream for multiple libraries with different chunk_num")
+ io::BinarySingleStreams lib_streams = raw_single_binary_readers(dataset_info.reads[libs[i]], followed_by_rc, including_paired_reads);
- for (size_t j = 0; j < cfg::get().max_threads; ++j) {
- streams[j].push_back(lib_streams.ptr_at(j));
- }
+ for (size_t j = 0; j < chunk_num; ++j) {
+ streams[j].push_back(lib_streams.ptr_at(j));
+ }
}
io::BinarySingleStreams joint_streams;
- for (size_t j = 0; j < cfg::get().max_threads; ++j) {
+ for (size_t j = 0; j < chunk_num; ++j) {
joint_streams.push_back(io::MultifileWrap<io::SingleReadSeq>(streams[j]));
}
return joint_streams;
}
inline
-io::BinaryPairedStreams paired_binary_readers(bool followed_by_rc,
- size_t insert_size = 0) {
- std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+io::BinaryPairedStreams paired_binary_readers(config::dataset& dataset_info,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+
+ std::vector<size_t> all_libs(dataset_info.reads.lib_count());
+ for (size_t i = 0; i < dataset_info.reads.lib_count(); ++i) {
all_libs[i] = i;
}
- return paired_binary_readers_for_libs(all_libs, followed_by_rc, insert_size);
+ return paired_binary_readers_for_libs(dataset_info, all_libs, followed_by_rc, insert_size);
}
inline
-io::BinarySingleStreams single_binary_readers(bool followed_by_rc,
- bool including_paired_reads) {
- std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+io::BinarySingleStreams single_binary_readers(config::dataset& dataset_info,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ std::vector<size_t> all_libs(dataset_info.reads.lib_count());
+ for (size_t i = 0; i < dataset_info.reads.lib_count(); ++i) {
all_libs[i] = i;
}
- return single_binary_readers_for_libs(all_libs, followed_by_rc, including_paired_reads);
+ return single_binary_readers_for_libs(dataset_info, all_libs, followed_by_rc, including_paired_reads);
}
inline
-io::BinarySingleStreamPtr single_binary_multireader(bool followed_by_rc, bool including_paired_reads) {
- return io::MultifileWrap<io::SingleReadSeq>(single_binary_readers(followed_by_rc, including_paired_reads));
+io::BinarySingleStreamPtr single_binary_multireader(config::dataset& dataset_info, bool followed_by_rc, bool including_paired_reads) {
+ return io::MultifileWrap<io::SingleReadSeq>(single_binary_readers(dataset_info, followed_by_rc, including_paired_reads));
}
inline
-io::BinaryPairedStreamPtr paired_binary_multireader(bool followed_by_rc, size_t insert_size = 0) {
- return io::MultifileWrap<io::PairedReadSeq>(paired_binary_readers(followed_by_rc, insert_size));
+io::BinaryPairedStreamPtr paired_binary_multireader(config::dataset& dataset_info, bool followed_by_rc, size_t insert_size = 0) {
+ return io::MultifileWrap<io::PairedReadSeq>(paired_binary_readers(dataset_info, followed_by_rc, insert_size));
}
-/*
-
-class BufferedReadersStorage {
-
-private:
-
- std::vector< SequenceSingleReadStream* > * single_streams_;
-
- std::vector< SequencePairedReadStream* > * paired_streams_;
-
- BufferedReadersStorage() {
- INFO("Creating buffered read storage");
-
- INFO("Buffering single reads... (takes a while)");
- single_streams_ = new std::vector< SequenceSingleReadStream* >(cfg::get().max_threads);
- for (size_t i = 0; i < cfg::get().max_threads; ++i) {
- io::PredictableIReader<io::SingleReadSeq> * s_stream = new io::SeqSingleReadStream(cfg::get().single_read_prefix, i);
- single_streams_->at(i) = new io::ReadBufferedStream<io::SingleReadSeq> (*s_stream);
- }
-
- INFO("Buffering paired reads... (takes a while)");
- paired_streams_ = new std::vector< SequencePairedReadStream* >(cfg::get().max_threads);
- for (size_t i = 0; i < cfg::get().max_threads; ++i) {
- io::PredictableIReader<io::PairedReadSeq> * p_stream = new io::SeqPairedReadStream(cfg::get().paired_read_prefix, i, 0);
- paired_streams_->at(i) = new io::ReadBufferedStream<io::PairedReadSeq> (*p_stream);
- }
- }
-
- BufferedReadersStorage(const BufferedReadersStorage&);
-
- BufferedReadersStorage& operator=(const BufferedReadersStorage&);
-
-public:
-
- static BufferedReadersStorage * GetInstance() {
- static BufferedReadersStorage instance;
- return &instance;
- }
-
-
- std::vector< SequenceSingleReadStream* > * GetSingleReaders() const {
- return single_streams_;
- }
-
- std::vector< SequencePairedReadStream* > * GetPairedReaders() const {
- return paired_streams_;
- }
-
-};
-
-
-std::vector< SequenceSingleReadStream* > single_buffered_binary_readers(bool followed_by_rc, bool including_paired_reads) {
- convert_if_needed();
-
- BufferedReadersStorage * storage = BufferedReadersStorage::GetInstance();
-
- if (including_paired_reads) {
- return apply_single_wrappers(followed_by_rc, *(storage->GetSingleReaders()), storage->GetPairedReaders());
- }
- else {
- return apply_single_wrappers(followed_by_rc, *(storage->GetSingleReaders()));
- }
-}
-
-std::vector< SequencePairedReadStream* > paired_buffered_binary_readers(bool followed_by_rc, size_t insert_size) {
- convert_if_needed();
-
- BufferedReadersStorage * storage = BufferedReadersStorage::GetInstance();
-
- std::vector<SequencePairedReadStream*> paired_streams(cfg::get().max_threads);
- for (size_t i = 0; i < cfg::get().max_threads; ++i) {
- paired_streams[i] = new io::InsertSizeModifyingWrapper(*(storage->GetPairedReaders()->at(i)), insert_size);
- }
- return apply_paired_wrappers(followed_by_rc, paired_streams);
-}
-
-auto_ptr<SequenceSingleReadStream> single_buffered_binary_multireader(bool followed_by_rc, bool including_paired_reads) {
- convert_if_needed();
-
- return auto_ptr<SequenceSingleReadStream>(new io::MultifileReader<io::SingleReadSeq>(single_buffered_binary_readers(followed_by_rc, including_paired_reads)));
-}
-
-auto_ptr<SequencePairedReadStream> paired_buffered_binary_multireader(bool followed_by_rc, size_t insert_size) {
- convert_if_needed();
-
- return auto_ptr<SequencePairedReadStream>(new io::MultifileReader<io::PairedReadSeq>(paired_buffered_binary_readers(followed_by_rc, insert_size)));
-}
-*/
}
diff --git a/src/modules/io/reads_io/modifying_reader_wrapper.hpp b/src/modules/io/reads_io/modifying_reader_wrapper.hpp
index 8039db0..5575e92 100644
--- a/src/modules/io/reads_io/modifying_reader_wrapper.hpp
+++ b/src/modules/io/reads_io/modifying_reader_wrapper.hpp
@@ -9,6 +9,7 @@
#include "dev_support/verify.hpp"
#include "io/reads_io/delegating_reader_wrapper.hpp"
+#include "paired_readers.hpp"
#include <memory>
#include <io/reads/single_read.hpp>
diff --git a/src/modules/io/reads_io/mpmc_bounded.hpp b/src/modules/io/reads_io/mpmc_bounded.hpp
index d82ced5..5721c79 100644
--- a/src/modules/io/reads_io/mpmc_bounded.hpp
+++ b/src/modules/io/reads_io/mpmc_bounded.hpp
@@ -94,6 +94,31 @@ public:
return true;
}
+ bool enqueue(T &&data) {
+ if (is_closed())
+ return false;
+
+ cell_t *cell;
+ size_t pos = enqueue_pos_.load(std::memory_order_relaxed);
+ for (; ;) {
+ cell = &buffer_[pos & buffer_mask_];
+ size_t seq = cell->sequence_.load(std::memory_order_acquire);
+ intptr_t dif = (intptr_t) seq - (intptr_t) pos;
+ if (dif == 0) {
+ if (enqueue_pos_.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed))
+ break;
+ } else if (dif < 0)
+ return false;
+ else
+ pos = enqueue_pos_.load(std::memory_order_relaxed);
+ }
+
+ cell->data_ = std::move(data);
+ cell->sequence_.store(pos + 1, std::memory_order_release);
+
+ return true;
+ }
+
bool dequeue(T &data) {
cell_t *cell;
size_t pos = dequeue_pos_.load(std::memory_order_relaxed);
@@ -110,7 +135,7 @@ public:
pos = dequeue_pos_.load(std::memory_order_relaxed);
}
- data = cell->data_;
+ data = std::move(cell->data_);
cell->sequence_.store(pos + buffer_mask_ + 1, std::memory_order_release);
return true;
diff --git a/src/modules/io/reads_io/paired_readers.hpp b/src/modules/io/reads_io/paired_readers.hpp
index 78cc4ba..14e84a7 100644
--- a/src/modules/io/reads_io/paired_readers.hpp
+++ b/src/modules/io/reads_io/paired_readers.hpp
@@ -10,6 +10,7 @@
#include <string>
#include "ireader.hpp"
#include "io/reads/paired_read.hpp"
+#include "file_reader.hpp"
#include "orientation.hpp"
namespace io {
diff --git a/src/modules/io/reads_io/read_processor.hpp b/src/modules/io/reads_io/read_processor.hpp
index 2648852..1da18de 100644
--- a/src/modules/io/reads_io/read_processor.hpp
+++ b/src/modules/io/reads_io/read_processor.hpp
@@ -18,7 +18,7 @@
#endif
namespace hammer {
class ReadProcessor {
- static size_t const cacheline_size = 64;
+ static size_t constexpr cacheline_size = 64;
typedef char cacheline_pad_t[cacheline_size];
unsigned nthreads_;
@@ -31,13 +31,15 @@ class ReadProcessor {
private:
template<class Reader, class Op>
bool RunSingle(Reader &irs, Op &op) {
+ using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
+
while (!irs.eof()) {
- typename Reader::ReadT r;
- irs >> r;
+ ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
+ irs >> *r;
read_ += 1;
processed_ += 1;
- if (op(r))
+ if (op(std::move(r))) // Pass ownership of read down to processor
return true;
}
@@ -46,12 +48,14 @@ private:
template<class Reader, class Op, class Writer>
void RunSingle(Reader &irs, Op &op, Writer &writer) {
+ using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
+
while (!irs.eof()) {
- typename Reader::ReadT r;
- irs >> r;
+ ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
+ irs >> *r;
read_ += 1;
- auto res = op(r);
+ auto res = op(std::move(r)); // Pass ownership of read down to processor
processed_ += 1;
if (res)
@@ -69,6 +73,8 @@ public:
template<class Reader, class Op>
bool Run(Reader &irs, Op &op) {
+ using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
+
if (nthreads_ < 2)
return RunSingle(irs, op);
@@ -81,7 +87,7 @@ public:
bufsize = (bufsize >> 16) | bufsize;
bufsize += 1;
- mpmc_bounded_queue<typename Reader::ReadT> in_queue(2 * bufsize);
+ mpmc_bounded_queue<ReadPtr> in_queue(2 * bufsize);
bool stop = false;
# pragma omp parallel shared(in_queue, irs, op, stop) num_threads(nthreads_)
@@ -89,12 +95,12 @@ public:
# pragma omp master
{
while (!irs.eof()) {
- typename Reader::ReadT r;
- irs >> r;
+ ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
+ irs >> *r;
# pragma omp atomic
read_ += 1;
- while (!in_queue.enqueue(r))
+ while (!in_queue.enqueue(std::move(r)))
sched_yield();
# pragma omp flush (stop)
@@ -106,7 +112,7 @@ public:
}
while (1) {
- typename Reader::ReadT r;
+ ReadPtr r;
if (!in_queue.wait_dequeue(r))
break;
@@ -114,7 +120,7 @@ public:
# pragma omp atomic
processed_ += 1;
- bool res = op(r);
+ bool res = op(std::move(r));
if (res) {
# pragma omp atomic
stop |= res;
@@ -128,6 +134,8 @@ public:
template<class Reader, class Op, class Writer>
void Run(Reader &irs, Op &op, Writer &writer) {
+ using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
+
if (nthreads_ < 2) {
RunSingle(irs, op, writer);
return;
@@ -142,55 +150,55 @@ public:
bufsize = (bufsize >> 16) | bufsize;
bufsize += 1;
- mpmc_bounded_queue<typename Reader::ReadT> in_queue(bufsize), out_queue(2 * bufsize);
+ mpmc_bounded_queue<ReadPtr> in_queue(bufsize), out_queue(2 * bufsize);
# pragma omp parallel shared(in_queue, out_queue, irs, op, writer) num_threads(nthreads_)
{
# pragma omp master
{
while (!irs.eof()) {
- typename Reader::ReadT r;
- irs >> r;
+ ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
+ irs >> *r;
// First, try to provide read to the queue. If it's full, never mind.
- bool status = in_queue.enqueue(r);
+ bool status = in_queue.enqueue(std::move(r));
// Flush down the output queue
- typename Reader::ReadT outr;
+ ReadPtr outr;
while (out_queue.dequeue(outr))
- writer << outr;
+ writer << *outr;
// If the input queue was originally full, wait until we can insert
// the read once again.
if (!status)
- while (!in_queue.enqueue(r))
+ while (!in_queue.enqueue(std::move(r)))
sched_yield();
}
in_queue.close();
// Flush down the output queue while in master threads.
- typename Reader::ReadT outr;
+ ReadPtr outr;
while (out_queue.dequeue(outr))
- writer << outr;
+ writer << *outr;
}
while (1) {
- typename Reader::ReadT r;
+ ReadPtr r;
if (!in_queue.wait_dequeue(r))
break;
- auto res = op(r);
+ auto res = op(std::move(r));
if (res)
- while (!out_queue.enqueue(*res))
+ while (!out_queue.enqueue(std::move(res)))
sched_yield();
}
}
// Flush down the output queue
- typename Reader::ReadT outr;
+ ReadPtr outr;
while (out_queue.dequeue(outr))
- writer << outr;
+ writer << *outr;
}
};
diff --git a/src/modules/io/reads_io/splitting_wrapper.hpp b/src/modules/io/reads_io/splitting_wrapper.hpp
index 026dff2..95a4f23 100644
--- a/src/modules/io/reads_io/splitting_wrapper.hpp
+++ b/src/modules/io/reads_io/splitting_wrapper.hpp
@@ -7,6 +7,7 @@
#pragma once
#include "io/reads/single_read.hpp"
+#include "read_stream_vector.hpp"
#include "delegating_reader_wrapper.hpp"
namespace io {
diff --git a/src/modules/math/kmer_coverage_model.cpp b/src/modules/math/kmer_coverage_model.cpp
index c957546..db886d7 100644
--- a/src/modules/math/kmer_coverage_model.cpp
+++ b/src/modules/math/kmer_coverage_model.cpp
@@ -183,7 +183,7 @@ size_t KMerCoverageModel::EstimateValley() const {
}
void KMerCoverageModel::Fit() {
- VERIFY_MSG(cov_.size() > 10, "Invalid kmer coverage histogram");
+ VERIFY_MSG(cov_.size() > 10, "Invalid kmer coverage histogram, make sure that the coverage is indeed uniform");
// Find the minimal coverage point using smoothed histogram.
Valley_ = EstimateValley();
@@ -214,7 +214,7 @@ void KMerCoverageModel::Fit() {
}
if (MaxCov_ - Valley_ < 3)
- WARN("Too much erroneous kmers, the estimates might be unreliable");
+ WARN("Too many erroneous kmers, the estimates might be unreliable");
std::vector <size_t> mvals(1 + MaxCov_ - Valley_);
mvals[0] = cov_[MaxCov_];
diff --git a/src/modules/paired_info/pair_info_improver.hpp b/src/modules/paired_info/pair_info_improver.hpp
index f1b392a..ac6475c 100644
--- a/src/modules/paired_info/pair_info_improver.hpp
+++ b/src/modules/paired_info/pair_info_improver.hpp
@@ -110,11 +110,11 @@ class PairInfoImprover {
omnigraph::de::PairedInfoIndexT<Graph>& index, size_t max_repeat_length)
: to_remove_(to_remove), graph_(g), index_(index), max_repeat_length_(max_repeat_length) {}
- bool operator()(EdgeId e) {
+ bool operator()(std::unique_ptr<EdgeId> e) {
omnigraph::de::PairedInfoIndexT<Graph> &to_remove = to_remove_[omp_get_thread_num()];
- if (graph_.length(e)>= max_repeat_length_ && index_.contains(e))
- FindInconsistent(e, to_remove);
+ if (graph_.length(*e)>= max_repeat_length_ && index_.contains(*e))
+ FindInconsistent(*e, to_remove);
return false;
}
diff --git a/src/modules/pipeline/config_struct.cpp b/src/modules/pipeline/config_struct.cpp
index 4e35ffd..fecc73b 100644
--- a/src/modules/pipeline/config_struct.cpp
+++ b/src/modules/pipeline/config_struct.cpp
@@ -41,7 +41,7 @@ void SequencingLibrary<debruijn_graph::config::DataSetData>::yamlize(llvm::yaml:
io.mapOptional("insert size distribution" , data_.insert_size_distribution);
io.mapOptional("average coverage" , data_.average_coverage);
io.mapOptional("pi threshold" , data_.pi_threshold);
- io.mapOptional("binary converted" , data_.binary_coverted);
+ io.mapOptional("binary converted" , data_.binary_reads_info.binary_coverted);
io.mapOptional("single reads mapped" , data_.single_reads_mapped);
}
@@ -153,6 +153,14 @@ void load(debruijn_config::simplification::tip_clipper &tc,
load(tc.condition, pt, "condition");
}
+void load(debruijn_config::simplification::dead_end_clipper& dead_end,
+ boost::property_tree::ptree const &pt,
+ bool /* complete */) {
+ using config_common::load;
+ load(dead_end.condition, pt, "condition");
+ load(dead_end.enabled, pt, "enabled");
+}
+
void load(resolving_mode &rm, boost::property_tree::ptree const &pt,
std::string const &key, bool complete) {
if (complete || pt.find(key) != pt.not_found()) {
@@ -300,6 +308,15 @@ void load(debruijn_config::simplification::erroneous_connections_remover& ec,
load(ec.condition, pt, "condition");
}
+void load(debruijn_config::simplification::relative_coverage_ec_remover& rcec,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(rcec.enabled, pt, "enabled");
+ load(rcec.max_ec_length, pt, "rcec_lb");
+ load(rcec.rcec_ratio, pt, "rcec_cb");
+}
+
void load(debruijn_config::simplification::topology_based_ec_remover& tec,
boost::property_tree::ptree const& pt, bool /*complete*/) {
using config_common::load;
@@ -520,10 +537,13 @@ void load(debruijn_config::simplification& simp,
load(simp.post_simplif_enabled, pt, "post_simplif_enabled", complete);
load(simp.topology_simplif_enabled, pt, "topology_simplif_enabled", complete);
load(simp.tc, pt, "tc", complete); // tip clipper:
+
+ load(simp.dead_end, pt, "dead_end", complete); // dead end:
load(simp.ttc, pt, "ttc", complete); // topology tip clipper:
load(simp.complex_tc, pt, "complex_tc", complete); // complex tip clipper:
load(simp.br, pt, "br", complete); // bulge remover:
load(simp.ec, pt, "ec", complete); // erroneous connections remover:
+ load(simp.rcec, pt, "rcec", complete); // relative coverage erroneous connections remover
load(simp.rcc, pt, "rcc", complete); // relative coverage component remover:
load(simp.relative_ed, pt, "relative_ed", complete); // relative edge disconnector:
load(simp.tec, pt, "tec", complete); // topology aware erroneous connections remover:
@@ -555,6 +575,8 @@ void load(debruijn_config::info_printer& printer,
load(printer.write_components_along_contigs, pt,
"write_components_along_contigs", complete);
load(printer.save_full_graph, pt, "save_full_graph", complete);
+ load(printer.save_all, pt, "save_all", complete);
+ load(printer.save_graph_pack, pt, "save_graph_pack", complete);
load(printer.write_full_graph, pt, "write_full_graph", complete);
load(printer.write_full_nc_graph, pt, "write_full_nc_graph", complete);
load(printer.write_error_loc, pt, "write_error_loc", complete);
@@ -626,6 +648,8 @@ void load_launch_info(debruijn_config &cfg, boost::property_tree::ptree const &p
load(cfg.use_additional_contigs, pt, "use_additional_contigs");
load(cfg.additional_contigs, pt, "additional_contigs");
+ INFO("Additional contigs is " << cfg.additional_contigs);
+
load(cfg.rr_enable, pt, "rr_enable");
load(cfg.buffer_size, pt, "buffer_size");
@@ -672,6 +696,8 @@ void load_cfg(debruijn_config &cfg, boost::property_tree::ptree const &pt,
load(cfg.use_intermediate_contigs, pt, "use_intermediate_contigs", complete);
load(cfg.single_reads_rr, pt, "single_reads_rr", complete);
+ load(cfg.preserve_raw_paired_index, pt, "preserve_raw_paired_index", complete);
+
load(cfg.correct_mismatches, pt, "correct_mismatches", complete);
load(cfg.paired_info_statistics, pt, "paired_info_statistics", complete);
load(cfg.paired_info_scaffolder, pt, "paired_info_scaffolder", complete);
@@ -758,12 +784,12 @@ void load(debruijn_config &cfg, const std::vector<std::string> &cfg_fns) {
}
if (!cfg.use_scaffolder) {
- cfg.pe_params.param_set.scaffolder_options.on = false;
+ cfg.pe_params.param_set.scaffolder_options.enabled = false;
}
cfg.need_mapping = cfg.developer_mode || cfg.correct_mismatches
|| cfg.gap_closer_enable || cfg.rr_enable;
- cfg.output_dir = cfg.output_base + "/K" + ToString(cfg.K) + "/";
+ cfg.output_dir = cfg.output_base + "/K" + std::to_string(cfg.K) + "/";
cfg.output_saves = cfg.output_dir + "saves/";
@@ -776,10 +802,17 @@ void load(debruijn_config &cfg, const std::vector<std::string> &cfg_fns) {
(cfg.output_base + "/" + cfg.temp_bin_reads_dir) :
(cfg.output_base + cfg.project_name + "/"
+ cfg.temp_bin_reads_dir);
- cfg.temp_bin_reads_info = cfg.temp_bin_reads_path + "INFO";
-
- cfg.paired_read_prefix = cfg.temp_bin_reads_path + "_paired";
- cfg.single_read_prefix = cfg.temp_bin_reads_path + "_single";
+ //cfg.temp_bin_reads_info = cfg.temp_bin_reads_path + "INFO";
+
+ for (size_t i = 0; i < cfg.ds.reads.lib_count(); ++i) {
+ auto& lib = cfg.ds.reads[i];
+ lib.data().lib_index = i;
+ lib.data().binary_reads_info.chunk_num = cfg.max_threads;
+ lib.data().binary_reads_info.bin_reads_info_file = cfg.temp_bin_reads_path + "INFO_" + std::to_string(i);
+ lib.data().binary_reads_info.buffer_size = cfg.buffer_size;
+ lib.data().binary_reads_info.paired_read_prefix = cfg.temp_bin_reads_path + "paired_" + std::to_string(i);
+ lib.data().binary_reads_info.single_read_prefix = cfg.temp_bin_reads_path + "single_" + std::to_string(i);
+ }
}
}
diff --git a/src/modules/pipeline/config_struct.hpp b/src/modules/pipeline/config_struct.hpp
index b1cce24..70e4e3b 100644
--- a/src/modules/pipeline/config_struct.hpp
+++ b/src/modules/pipeline/config_struct.hpp
@@ -107,16 +107,25 @@ struct DataSetData {
double insert_size_mad;
std::map<int, size_t> insert_size_distribution;
- bool binary_coverted;
+ size_t lib_index;
bool single_reads_mapped;
-
uint64_t total_nucls;
+ size_t read_count;
+
double average_coverage;
double pi_threshold;
- std::string paired_read_prefix;
- std::string single_read_prefix;
- size_t thread_num;
+ struct BinaryReadsInfo {
+ BinaryReadsInfo(): binary_coverted(false), chunk_num(0), buffer_size(0) {}
+
+ bool binary_coverted;
+ std::string bin_reads_info_file;
+ std::string paired_read_prefix;
+ std::string single_read_prefix;
+ size_t chunk_num;
+ size_t buffer_size;
+ } binary_reads_info;
+
DataSetData(): read_length(0), avg_read_length(0.0),
mean_insert_size(0.0),
@@ -125,15 +134,18 @@ struct DataSetData {
insert_size_right_quantile(0.0),
median_insert_size(0.0),
insert_size_mad(0.0),
- binary_coverted(false),
+ lib_index(0),
single_reads_mapped(false),
total_nucls(0),
+ read_count(0),
average_coverage(0.0),
- pi_threshold(0.0) {
- }
+ pi_threshold(0.0),
+ binary_reads_info() {}
};
struct dataset {
+ typedef io::DataSet<DataSetData>::Library Library;
+
io::DataSet<DataSetData> reads;
size_t max_read_length;
@@ -178,6 +190,8 @@ struct debruijn_config {
bool developer_mode;
+ bool preserve_raw_paired_index;
+
struct simplification {
struct tip_clipper {
std::string condition;
@@ -185,6 +199,11 @@ struct debruijn_config {
tip_clipper(std::string condition_) : condition(condition_) {}
};
+ struct dead_end_clipper {
+ std::string condition;
+ bool enabled;
+ };
+
struct topology_tip_clipper {
double length_coeff;
size_t uniqueness_length;
@@ -221,9 +240,9 @@ struct debruijn_config {
};
struct relative_coverage_ec_remover {
- size_t max_ec_length_coefficient;
- double max_coverage_coeff;
- double coverage_gap;
+ bool enabled;
+ size_t max_ec_length;
+ double rcec_ratio;
};
struct topology_based_ec_remover {
@@ -301,10 +320,12 @@ struct debruijn_config {
bool post_simplif_enabled;
bool topology_simplif_enabled;
tip_clipper tc;
+ dead_end_clipper dead_end;
complex_tip_clipper complex_tc;
topology_tip_clipper ttc;
bulge_remover br;
erroneous_connections_remover ec;
+ relative_coverage_ec_remover rcec;
relative_coverage_comp_remover rcc;
relative_coverage_edge_disconnector relative_ed;
topology_based_ec_remover tec;
@@ -379,7 +400,6 @@ struct debruijn_config {
double small_component_relative_coverage;
size_t min_component_length;
size_t min_isolated_length;
-
};
struct pacbio_processor {
@@ -425,6 +445,8 @@ struct debruijn_config {
bool write_components_along_genome;
bool write_components_along_contigs;
bool save_full_graph;
+ bool save_all;
+ bool save_graph_pack;
bool write_error_loc;
bool write_full_graph;
bool write_full_nc_graph;
diff --git a/src/modules/pipeline/genomic_info_filler.cpp b/src/modules/pipeline/genomic_info_filler.cpp
index 8b71fa3..65a8eda 100644
--- a/src/modules/pipeline/genomic_info_filler.cpp
+++ b/src/modules/pipeline/genomic_info_filler.cpp
@@ -111,7 +111,7 @@ void GenomicInfoFiller::run(conj_graph_pack &gp, const char*) {
std::map<size_t, size_t> tmp;
size_t maxcov = 0;
size_t kmer_per_record = 1;
- if (conj_graph_pack::index_t::InnerIndexT::storing_type::IsInvertable())
+ if (conj_graph_pack::index_t::InnerIndex::storing_type::IsInvertable())
kmer_per_record = 2;
for (auto I = gp.index.inner_index().value_cbegin(), E = gp.index.inner_index().value_cend(); I != E; ++I) {
diff --git a/src/modules/pipeline/graph_pack.hpp b/src/modules/pipeline/graph_pack.hpp
index 9c997fd..e445ba0 100644
--- a/src/modules/pipeline/graph_pack.hpp
+++ b/src/modules/pipeline/graph_pack.hpp
@@ -26,15 +26,13 @@
namespace debruijn_graph {
-/*KmerFree*//*KmerStoring*/
-template<class Graph,
- class KmerEdgeIndex = KmerStoringEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, DefaultStoring>>
+template<class Graph>
struct graph_pack: private boost::noncopyable {
typedef Graph graph_t;
typedef typename Graph::VertexId VertexId;
typedef typename Graph::EdgeId EdgeId;
typedef runtime_k::RtSeq seq_t;
- typedef EdgeIndex<graph_t, seq_t, KmerEdgeIndex> index_t;
+ typedef EdgeIndex<graph_t> index_t;
using PairedInfoIndicesT = omnigraph::de::PairedInfoIndicesT<Graph>;
//typedef omnigraph::de::PairedInfoIndicesT<Graph> PairedInfoIndicesT;
typedef omnigraph::de::UnclusteredPairedInfoIndicesT<Graph> UnclusteredPairedInfoIndicesT;
@@ -151,7 +149,7 @@ struct graph_pack: private boost::noncopyable {
};
-typedef graph_pack<ConjugateDeBruijnGraph, KmerFreeEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, DefaultStoring>> conj_graph_pack;
+typedef graph_pack<ConjugateDeBruijnGraph> conj_graph_pack;
typedef conj_graph_pack::index_t Index;
typedef conj_graph_pack::PairedInfoIndicesT PairedIndicesT;
diff --git a/src/modules/pipeline/library.hpp b/src/modules/pipeline/library.hpp
index 580fcaf..a183fe9 100644
--- a/src/modules/pipeline/library.hpp
+++ b/src/modules/pipeline/library.hpp
@@ -252,6 +252,7 @@ private:
// Just convenient wrapper to "unwrap" the iterators over libraries.
template<class Data = NoData>
class DataSet {
+public:
typedef SequencingLibrary<Data> Library;
typedef std::vector<Library> LibraryStorage;
@@ -304,7 +305,7 @@ public:
single_reads_iterator reads_end() const {
return single_reads_iterator(libraries_.back().reads_end(), libraries_.back().reads_end());
}
- adt::iterator_range<single_reads_iterator> reads() {
+ adt::iterator_range<single_reads_iterator> reads() const {
return adt::make_range(reads_begin(), reads_end());
}
@@ -320,7 +321,7 @@ public:
single_reads_iterator single_end() const {
return single_reads_iterator(libraries_.back().single_end(), libraries_.back().single_end());
}
- adt::iterator_range<single_reads_iterator> single_reads() {
+ adt::iterator_range<single_reads_iterator> single_reads() const {
return adt::make_range(single_begin(), single_end());
}
diff --git a/src/modules/stages/construction.cpp b/src/modules/stages/construction.cpp
index 86bd711..5702185 100644
--- a/src/modules/stages/construction.cpp
+++ b/src/modules/stages/construction.cpp
@@ -38,7 +38,7 @@ void Construction::run(conj_graph_pack &gp, const char*) {
// Has to be separate stream for not counting it in coverage
io::ReadStreamList<io::SingleRead> trusted_contigs;
if (cfg::get().use_additional_contigs) {
- INFO("Contigs from previous K will be used");
+ DEBUG("Contigs from previous K will be used: " << cfg::get().additional_contigs);
trusted_contigs.push_back(io::EasyStream(cfg::get().additional_contigs, true));
}
@@ -57,12 +57,13 @@ void Construction::run(conj_graph_pack &gp, const char*) {
INFO("Trusted contigs will be used in graph construction");
auto contigs_stream = MultifileWrap(trusted_contigs);
+ auto& dataset = cfg::get_writable().ds;
std::vector<size_t> libs_for_construction;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
- if (cfg::get().ds.reads[i].is_graph_contructable())
+ for (size_t i = 0; i < dataset.reads.lib_count(); ++i)
+ if (dataset.reads[i].is_graph_contructable())
libs_for_construction.push_back(i);
- auto streams = single_binary_readers_for_libs(libs_for_construction, true, true);
+ auto streams = single_binary_readers_for_libs(dataset, libs_for_construction, true, true);
construct_graph<io::SingleReadSeq>(streams, gp, contigs_stream);
}
diff --git a/src/modules/stages/simplification.cpp b/src/modules/stages/simplification.cpp
index 665e9d3..cd46d1a 100644
--- a/src/modules/stages/simplification.cpp
+++ b/src/modules/stages/simplification.cpp
@@ -106,7 +106,7 @@ class GraphSimplifier {
"Self conjugate edge remover",
algos);
- if (cfg::get().mode == config::pipeline_type::rna){
+ if (info_container_.mode() == config::pipeline_type::rna){
RemoveShortPolyATEdges(1, removal_handler_, info_container_.chunk_cnt());
PushValid(ShortPolyATEdgesRemoverInstance(g_, 1, removal_handler_, info_container_.chunk_cnt()), "Short PolyA/T Edges",algos) ;
PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos);
@@ -147,7 +147,7 @@ class GraphSimplifier {
RunAlgos(algos);
//FIXME why called directly?
- if (cfg::get().mode == config::pipeline_type::rna){
+ if (info_container_.mode() == config::pipeline_type::rna){
RemoveHiddenLoopEC(g_, gp_.flanking_cov, info_container_.detected_coverage_bound(), simplif_cfg_.her, removal_handler_);
cnt_callback_.Report();
}
@@ -249,7 +249,7 @@ class GraphSimplifier {
//FIXME need better configuration
- if (cfg::get().mode == config::pipeline_type::meta) {
+ if (info_container_.mode() == config::pipeline_type::meta) {
PushValid(
BRInstance(g_, simplif_cfg_.second_final_br,
info_container_, removal_handler_),
@@ -257,7 +257,7 @@ class GraphSimplifier {
algos);
}
- if (cfg::get().mode == config::pipeline_type::rna) {
+ if (info_container_.mode() == config::pipeline_type::rna) {
PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos);
}
@@ -382,8 +382,65 @@ public:
INFO("PostSimplification disabled");
}
}
+
+ void SimplifyRNAGraph() {
+ printer_(info_printer_pos::before_simplification);
+ INFO("Graph simplification started");
+
+ InitialCleaning();
+
+ if (gp_.genome.GetSequence().size() > 0) {
+ DEBUG("Reference genome length = " + std::to_string(gp_.genome.GetSequence().size()));
+ }
+
+ AlgoStorageT ec_algo;
+
+ PushValid(ECRemoverInstance(g_, simplif_cfg_.ec, info_container_, removal_handler_,
+ simplif_cfg_.cycle_iter_count), "Low coverage edge remover", ec_algo);
+
+ size_t iteration = 0;
+ bool graph_changed_ec = true;
+ //TODO: config. Or just graph_changed?
+ size_t tc_max_iteration = 2;
+ //cannot stop simply if nothing changed, since threshold change on every iteration
+ while (iteration < simplif_cfg_.cycle_iter_count || graph_changed_ec) {
+ AlgoStorageT algos;
+ PushValid(
+ TipClipperInstance(g_, simplif_cfg_.tc, info_container_, removal_handler_, tc_max_iteration),
+ "Tip clipper",
+ algos);
+ PushValid(
+ DeadEndInstance(g_, simplif_cfg_.dead_end, info_container_, removal_handler_, tc_max_iteration),
+ "Dead end clipper",
+ algos);
+ PushValid(
+ BRInstance(g_, simplif_cfg_.br, info_container_, removal_handler_, tc_max_iteration),
+ "Bulge remover",
+ algos);
+ bool graph_changed = true;
+ size_t tc_iteration = 0;
+
+ while (tc_iteration < tc_max_iteration || graph_changed) {
+ INFO("PROCEDURE == Tip clipper and bulge removal cycle, iteration " << iteration + 1 << "." << tc_iteration);
+ graph_changed = RunAlgos(algos);
+ ++tc_iteration;
+ }
+ INFO("PROCEDURE == Erroneous connection, iteration " << iteration + 1);
+ graph_changed_ec = RunAlgos(ec_algo);
+ ++iteration;
+ }
+
+ printer_(info_printer_pos::before_post_simplification);
+
+ if (simplif_cfg_.post_simplif_enabled) {
+ PostSimplification();
+ } else {
+ INFO("PostSimplification disabled");
+ }
+ }
};
+
void Simplification::run(conj_graph_pack &gp, const char*) {
using namespace omnigraph;
@@ -407,7 +464,7 @@ void Simplification::run(conj_graph_pack &gp, const char*) {
// boost::ref(qual_removal_handler), _1);
- SimplifInfoContainer info_container;
+ SimplifInfoContainer info_container(cfg::get().mode);
info_container.set_read_length(cfg::get().ds.RL())
.set_main_iteration(cfg::get().main_iteration)
.set_chunk_cnt(5 * cfg::get().max_threads);
@@ -419,21 +476,29 @@ void Simplification::run(conj_graph_pack &gp, const char*) {
.set_detected_coverage_bound(gp.ginfo.ec_bound());
GraphSimplifier simplifier(gp, info_container,
- preliminary_ ? *cfg::get().preliminary_simp : cfg::get().simp,
- nullptr/*removal_handler_f*/,
- printer);
- simplifier.SimplifyGraph();
+ preliminary_ ? *cfg::get().preliminary_simp : cfg::get().simp,
+ nullptr/*removal_handler_f*/,
+ printer);
+ if (cfg::get().mode == pipeline_type::rna)
+ simplifier.SimplifyRNAGraph();
+ else
+ simplifier.SimplifyGraph();
+
}
void SimplificationCleanup::run(conj_graph_pack &gp, const char*) {
- SimplifInfoContainer info_container;
+ SimplifInfoContainer info_container(cfg::get().mode);
info_container
.set_read_length(cfg::get().ds.RL())
.set_main_iteration(cfg::get().main_iteration)
.set_chunk_cnt(5 * cfg::get().max_threads);
- IsolatedEdgeRemoverInstance(gp.g, cfg::get().simp.ier, info_container, (HandlerF<Graph>)nullptr)->Run();
+
+ auto isolated_edge_remover =
+ IsolatedEdgeRemoverInstance(gp.g, cfg::get().simp.ier, info_container, (HandlerF<Graph>)nullptr);
+ if (isolated_edge_remover != nullptr)
+ isolated_edge_remover->Run();
double low_threshold = gp.ginfo.trusted_bound();
if (math::gr(low_threshold, 0.0)) {
diff --git a/src/modules/stages/simplification_pipeline/graph_simplification.hpp b/src/modules/stages/simplification_pipeline/graph_simplification.hpp
index cd9d9d4..013443e 100644
--- a/src/modules/stages/simplification_pipeline/graph_simplification.hpp
+++ b/src/modules/stages/simplification_pipeline/graph_simplification.hpp
@@ -58,7 +58,7 @@ class ConditionParser {
private:
typedef typename Graph::EdgeId EdgeId;
- const Graph& g_;
+ const Graph &g_;
string next_token_;
string input_;
const SimplifInfoContainer settings_;
@@ -80,7 +80,7 @@ private:
}
template<typename T>
- bool RelaxMax(T& cur_max, T t) {
+ bool RelaxMax(T &cur_max, T t) {
if (t > cur_max) {
cur_max = t;
return true;
@@ -89,7 +89,7 @@ private:
}
template<typename T>
- bool RelaxMin(T& cur_min, T t) {
+ bool RelaxMin(T &cur_min, T t) {
if (t < cur_min) {
cur_min = t;
return true;
@@ -105,8 +105,8 @@ private:
}
}
- pred::TypedPredicate<EdgeId> ParseCondition(size_t& min_length_bound,
- double& min_coverage_bound) {
+ pred::TypedPredicate<EdgeId> ParseCondition(size_t &min_length_bound,
+ double &min_coverage_bound) {
if (next_token_ == "tc_lb") {
double length_coeff = std::stod(ReadNext());
@@ -142,7 +142,7 @@ private:
RelaxMin(min_length_bound, length_bound);
DEBUG("Min length bound - " << min_length_bound);
return LengthUpperBound<Graph>(g_, length_bound);
-
+
} else if (next_token_ == "ec_lb") {
size_t length_coeff = std::stoll(ReadNext());
@@ -195,8 +195,8 @@ private:
}
}
- pred::TypedPredicate<EdgeId> ParseConjunction(size_t& min_length_bound,
- double& min_coverage_bound) {
+ pred::TypedPredicate<EdgeId> ParseConjunction(size_t &min_length_bound,
+ double &min_coverage_bound) {
pred::TypedPredicate<EdgeId> answer = pred::AlwaysTrue<EdgeId>();
VERIFY(next_token_ == "{");
ReadNext();
@@ -210,7 +210,7 @@ private:
public:
- ConditionParser(const Graph& g, string input, const SimplifInfoContainer& settings,
+ ConditionParser(const Graph &g, string input, const SimplifInfoContainer &settings,
size_t curr_iteration = -1ul, size_t iteration_cnt = -1ul)
: g_(g),
input_(input),
@@ -261,10 +261,10 @@ private:
//todo move to visualization
template<class graph_pack>
shared_ptr<omnigraph::visualization::GraphColorer<typename graph_pack::graph_t>> DefaultGPColorer(
- const graph_pack& gp) {
+ const graph_pack &gp) {
auto mapper = MapperInstance(gp);
- auto path1 = mapper->MapSequence(gp.genome).path();
- auto path2 = mapper->MapSequence(!gp.genome).path();
+ auto path1 = mapper->MapSequence(gp.genome.GetSequence()).path();
+ auto path2 = mapper->MapSequence(!gp.genome.GetSequence()).path();
return omnigraph::visualization::DefaultColorer(gp.g, path1, path2);
}
@@ -272,14 +272,14 @@ template<class Graph>
class EditDistanceTrackingCallback {
typedef typename Graph::EdgeId EdgeId;
typedef typename Graph::EdgeData EdgeData;
- const Graph& g_;
+ const Graph &g_;
public:
- EditDistanceTrackingCallback(const Graph& g)
+ EditDistanceTrackingCallback(const Graph &g)
: g_(g) {
}
- bool operator()(EdgeId edge, const vector<EdgeId>& path) const {
+ bool operator()(EdgeId edge, const vector<EdgeId> &path) const {
vector<Sequence> path_sequences;
for (auto it = path.begin(); it != path.end(); ++it) {
path_sequences.push_back(g_.EdgeNucls(*it));
@@ -292,16 +292,15 @@ public:
}
private:
- DECL_LOGGER("EditDistanceTrackingCallback")
- ;
+ DECL_LOGGER("EditDistanceTrackingCallback");
};
//template<class Graph, class SmartEdgeIt>
//bool ClipTips(
-// Graph& g,
-// SmartEdgeIt& it,
-// const config::debruijn_config::simplification::tip_clipper& tc_config,
-// const SimplifInfoContainer& info,
+// Graph &g,
+// SmartEdgeIt &it,
+// const config::debruijn_config::simplification::tip_clipper &tc_config,
+// const SimplifInfoContainer &info,
// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
//
// INFO("Clipping tips");
@@ -322,9 +321,9 @@ private:
//template<class Graph>
//bool ClipTips(
-// Graph& g,
-// const config::debruijn_config::simplification::tip_clipper& tc_config,
-// const SimplifInfoContainer& info,
+// Graph &g,
+// const config::debruijn_config::simplification::tip_clipper &tc_config,
+// const SimplifInfoContainer &info,
// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
//
// auto it = g.SmartEdgeBegin(LengthComparator<Graph>(g), true);
@@ -334,7 +333,7 @@ private:
//enabling tip projection, todo optimize if hotspot
template<class gp_t>
HandlerF<typename gp_t::graph_t> WrapWithProjectionCallback(
- gp_t& gp,
+ gp_t &gp,
HandlerF<typename gp_t::graph_t> removal_handler) {
typedef typename gp_t::graph_t Graph;
typedef typename Graph::EdgeId EdgeId;
@@ -377,10 +376,10 @@ protected:
}
public:
- LowCoverageEdgeRemovingAlgorithm(Graph& g,
- const InterestingEdgeFinder& interest_edge_finder,
- const SimplifInfoContainer& simplif_info,
- const std::string& condition_str,
+ LowCoverageEdgeRemovingAlgorithm(Graph &g,
+ const InterestingEdgeFinder &interest_edge_finder,
+ const SimplifInfoContainer &simplif_info,
+ const std::string &condition_str,
std::function<void(EdgeId)> removal_handler = nullptr,
bool canonical_only = false,
bool track_changes = true,
@@ -395,13 +394,14 @@ public:
condition_str_(condition_str),
remove_condition_(pred::AlwaysFalse<EdgeId>()),
proceed_condition_(pred::AlwaysTrue<EdgeId>()) {}
+
private:
DECL_LOGGER("LowCoverageEdgeRemovingAlgorithm");
};
template<class Graph>
-AlternativesAnalyzer<Graph> ParseBRConfig(const Graph& g,
- const config::debruijn_config::simplification::bulge_remover& config) {
+AlternativesAnalyzer<Graph> ParseBRConfig(const Graph &g,
+ const config::debruijn_config::simplification::bulge_remover &config) {
size_t max_length = LengthThresholdFinder::MaxBulgeLength(
g.k(), config.max_bulge_length_coefficient,
config.max_additive_length_coefficient);
@@ -417,12 +417,12 @@ AlternativesAnalyzer<Graph> ParseBRConfig(const Graph& g,
}
template<class Graph>
-AlgoPtr<Graph> SelfConjugateEdgeRemoverInstance(Graph &g, const string& condition_str,
- const SimplifInfoContainer& info,
+AlgoPtr<Graph> SelfConjugateEdgeRemoverInstance(Graph &g, const string &condition_str,
+ const SimplifInfoContainer &info,
HandlerF<Graph> removal_handler = 0) {
ConditionParser<Graph> parser(g, condition_str, info);
auto condition = pred::And(SelfConjugateCondition<Graph>(g), parser());
-
+
return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
condition,
info.chunk_cnt(),
@@ -433,19 +433,19 @@ AlgoPtr<Graph> SelfConjugateEdgeRemoverInstance(Graph &g, const string& conditio
template<class Graph>
bool RemoveRelativelyLowCoverageComponents(
Graph &g,
- const FlankingCoverage<Graph>& flanking_cov,
- const config::debruijn_config::simplification::relative_coverage_comp_remover& rcc_config,
- const SimplifInfoContainer& info,
+ const FlankingCoverage<Graph> &flanking_cov,
+ const config::debruijn_config::simplification::relative_coverage_comp_remover &rcc_config,
+ const SimplifInfoContainer &info,
typename ComponentRemover<Graph>::HandlerF removal_handler = 0) {
if (rcc_config.enabled) {
INFO("Removing relatively low covered connections");
size_t connecting_path_length_bound = LengthThresholdFinder::MaxErroneousConnectionLength(
g.k(), rcc_config.max_ec_length_coefficient);
- std::string pics_dir = "";//cfg::get().output_dir + "rel_cov_components/"
+ std::string pics_dir = "";
- double max_coverage = math::ge(rcc_config.max_coverage_coeff, 0.)
- ? info.detected_coverage_bound() * rcc_config.max_coverage_coeff
+ double max_coverage = math::ge(rcc_config.max_coverage_coeff, 0.)
+ ? info.detected_coverage_bound() * rcc_config.max_coverage_coeff
: std::numeric_limits<double>::max();
omnigraph::simplification::relative_coverage::
@@ -467,8 +467,8 @@ bool RemoveRelativelyLowCoverageComponents(
template<class Graph>
bool DisconnectRelativelyLowCoverageEdges(Graph &g,
- const FlankingCoverage<Graph>& flanking_cov,
- const config::debruijn_config::simplification::relative_coverage_edge_disconnector& rced_config) {
+ const FlankingCoverage<Graph> &flanking_cov,
+ const config::debruijn_config::simplification::relative_coverage_edge_disconnector &rced_config) {
if (rced_config.enabled) {
INFO("Disconnecting edges with relatively low coverage");
omnigraph::simplification::relative_coverage::RelativeCoverageDisconnector<
@@ -484,7 +484,7 @@ bool DisconnectRelativelyLowCoverageEdges(Graph &g,
template<class Graph>
bool RemoveComplexBulges(
- Graph& g,
+ Graph &g,
config::debruijn_config::simplification::complex_bulge_remover cbr_config,
size_t /*iteration*/ = 0) {
if (!cbr_config.enabled)
@@ -531,8 +531,8 @@ bool RemoveComplexBulges(
//}
template<class Graph>
-bool ClipComplexTips(Graph& g, config::debruijn_config::simplification::complex_tip_clipper ctc_conf, const SimplifInfoContainer& info, HandlerF<Graph> removal_handler = 0) {
- if(!ctc_conf.enabled) {
+bool ClipComplexTips(Graph &g, config::debruijn_config::simplification::complex_tip_clipper ctc_conf, const SimplifInfoContainer &info, HandlerF<Graph> removal_handler = 0) {
+ if (!ctc_conf.enabled) {
INFO("Complex tip clipping disabled");
return false;
}
@@ -553,13 +553,13 @@ bool ClipComplexTips(Graph& g, config::debruijn_config::simplification::complex_
}
template<class Graph>
-AlgoPtr<Graph> ShortPolyATEdgesRemoverInstance (Graph &g, size_t max_length, HandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1){
+AlgoPtr<Graph> ShortPolyATEdgesRemoverInstance(Graph &g, size_t max_length, HandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1) {
auto condition = pred::And(ATCondition<Graph>(g, 0.8, max_length, false), LengthUpperBound<Graph>(g, 1));
return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g, condition, chunk_cnt, removal_handler, true);
}
template<class Graph>
-AlgoPtr<Graph> ATTipClipperInstance (Graph &g, HandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1) {
+AlgoPtr<Graph> ATTipClipperInstance(Graph &g, HandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1) {
//TODO: review params 0.8, 200?
return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g, ATCondition<Graph>(g, 0.8, 200, true), chunk_cnt, removal_handler, true);
}
@@ -567,7 +567,7 @@ AlgoPtr<Graph> ATTipClipperInstance (Graph &g, HandlerF<Graph> removal_handler =
template<class Graph>
AlgoPtr<Graph> IsolatedEdgeRemoverInstance(Graph &g,
config::debruijn_config::simplification::isolated_edges_remover ier,
- const SimplifInfoContainer& info,
+ const SimplifInfoContainer &info,
HandlerF<Graph> removal_handler = 0) {
if (!ier.enabled) {
return nullptr;
@@ -591,17 +591,17 @@ AlgoPtr<Graph> IsolatedEdgeRemoverInstance(Graph &g,
}
template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryBulgeCondition(const Graph& g,
- const config::debruijn_config::simplification::bulge_remover& br_config,
+pred::TypedPredicate<typename Graph::EdgeId> NecessaryBulgeCondition(const Graph &g,
+ const config::debruijn_config::simplification::bulge_remover &br_config,
const SimplifInfoContainer&) {
auto analyzer = ParseBRConfig(g, br_config);
return omnigraph::NecessaryBulgeCondition(g, analyzer.max_length(), analyzer.max_coverage());
}
template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryTipCondition(const Graph& g,
- const config::debruijn_config::simplification::tip_clipper& tc_config,
- const SimplifInfoContainer& info) {
+pred::TypedPredicate<typename Graph::EdgeId> NecessaryTipCondition(const Graph &g,
+ const config::debruijn_config::simplification::tip_clipper &tc_config,
+ const SimplifInfoContainer &info) {
ConditionParser<Graph> parser(g, tc_config.condition, info);
auto condition = parser();
return omnigraph::NecessaryTipCondition(g, parser.max_length_bound(),
@@ -609,9 +609,11 @@ pred::TypedPredicate<typename Graph::EdgeId> NecessaryTipCondition(const Graph&
}
template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryECCondition(const Graph& g,
- const config::debruijn_config::simplification::erroneous_connections_remover& ec_config,
- const SimplifInfoContainer& info, size_t current_iteration = 0, size_t iteration_cnt = 1) {
+pred::TypedPredicate<typename Graph::EdgeId> NecessaryECCondition(const Graph &g,
+ const config::debruijn_config::simplification::erroneous_connections_remover &ec_config,
+ const SimplifInfoContainer &info,
+ size_t current_iteration = 0,
+ size_t iteration_cnt = 1) {
ConditionParser<Graph> parser(g, ec_config.condition, info, current_iteration, iteration_cnt);
auto condition = parser();
return omnigraph::NecessaryECCondition(g, parser.max_length_bound(),
@@ -619,9 +621,9 @@ pred::TypedPredicate<typename Graph::EdgeId> NecessaryECCondition(const Graph& g
}
template<class Graph>
-AlgoPtr<Graph> ECRemoverInstance(Graph& g,
- const config::debruijn_config::simplification::erroneous_connections_remover& ec_config,
- const SimplifInfoContainer& info,
+AlgoPtr<Graph> ECRemoverInstance(Graph &g,
+ const config::debruijn_config::simplification::erroneous_connections_remover &ec_config,
+ const SimplifInfoContainer &info,
HandlerF<Graph> removal_handler,
size_t iteration_cnt = 1) {
if (ec_config.condition.empty())
@@ -637,9 +639,47 @@ AlgoPtr<Graph> ECRemoverInstance(Graph& g,
}
template<class Graph>
-AlgoPtr<Graph> TipClipperInstance(Graph& g,
- const EdgeConditionT<Graph>& condition,
- const SimplifInfoContainer& info,
+AlgoPtr<Graph> RelativeECRemoverInstance(Graph &g,
+ const config::debruijn_config::simplification::relative_coverage_ec_remover &rcec_config,
+ const SimplifInfoContainer &info,
+ HandlerF<Graph> removal_handler,
+ size_t iteration_cnt = 1) {
+ if (!rcec_config.enabled)
+ return nullptr;
+
+ return make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
+ AddRelativeCoverageECCondition(g, rcec_config.rcec_ratio,
+ AddAlternativesPresenceCondition(g, pred::TypedPredicate<typename Graph::EdgeId>
+ (LengthUpperBound<Graph>(g, rcec_config.max_ec_length)))),
+ info.chunk_cnt(), removal_handler, /*canonical_only*/true);
+}
+
+template<class Graph>
+AlgoPtr<Graph> NotBulgeECRemoverInstance(Graph &g,
+ const config::debruijn_config::simplification::erroneous_connections_remover &ec_config,
+ const SimplifInfoContainer &info, HandlerF<Graph> removal_handler,
+ size_t iteration_cnt = 1) {
+ if (ec_config.condition.empty())
+ return nullptr;
+
+ std::string curr_condition = ec_config.condition;
+ ConditionParser<Graph> parser(g, curr_condition, info, iteration_cnt - 1, iteration_cnt);
+ auto condition = parser();
+
+ typedef omnigraph::ParallelInterestingElementFinder<Graph> InterestingFinderT;
+ InterestingFinderT interesting_finder(g, AddNotBulgeECCondition(g, AddAlternativesPresenceCondition(g, pred::And(
+ LengthUpperBound<Graph>(g, parser.max_length_bound()),
+ CoverageUpperBound<Graph>(g, parser.max_coverage_bound())))),
+ info.chunk_cnt());
+ return make_shared<LowCoverageEdgeRemovingAlgorithm<Graph, InterestingFinderT>>(
+ g, interesting_finder, info, ec_config.condition, removal_handler,
+ /*canonical only*/ true, /*track changes*/ true, iteration_cnt);
+}
+
+template<class Graph>
+AlgoPtr<Graph> TipClipperInstance(Graph &g,
+ const EdgeConditionT<Graph> &condition,
+ const SimplifInfoContainer &info,
HandlerF<Graph> removal_handler,
bool track_changes = true,
size_t /*iteration_cnt*/ = 1) {
@@ -653,9 +693,9 @@ AlgoPtr<Graph> TipClipperInstance(Graph& g,
}
template<class Graph>
-AlgoPtr<Graph> TipClipperInstance(Graph& g,
- const config::debruijn_config::simplification::tip_clipper& tc_config,
- const SimplifInfoContainer& info,
+AlgoPtr<Graph> TipClipperInstance(Graph &g,
+ const config::debruijn_config::simplification::tip_clipper &tc_config,
+ const SimplifInfoContainer &info,
HandlerF<Graph> removal_handler,
size_t iteration_cnt = 1) {
if (tc_config.condition.empty())
@@ -667,10 +707,26 @@ AlgoPtr<Graph> TipClipperInstance(Graph& g,
}
template<class Graph>
+AlgoPtr<Graph> DeadEndInstance(Graph &g,
+ const config::debruijn_config::simplification::dead_end_clipper &dead_end_config,
+ const SimplifInfoContainer &info,
+ HandlerF<Graph> removal_handler,
+ size_t /*iteration_cnt*/ = 1) {
+ if (!dead_end_config.enabled || dead_end_config.condition.empty())
+ return nullptr;
+
+ ConditionParser<Graph> parser(g, dead_end_config.condition, info);
+ auto condition = parser();
+ return make_shared<ParallelEdgeRemovingAlgorithm<Graph, LengthComparator<Graph>>>(g,
+ AddDeadEndCondition(g, condition), info.chunk_cnt(), removal_handler, /*canonical_only*/true,
+ LengthComparator<Graph>(g), /*track changes*/true);
+}
+
+template<class Graph>
AlgoPtr<Graph> TopologyTipClipperInstance(
Graph &g,
- const config::debruijn_config::simplification::topology_tip_clipper& ttc_config,
- const SimplifInfoContainer& info,
+ const config::debruijn_config::simplification::topology_tip_clipper &ttc_config,
+ const SimplifInfoContainer &info,
HandlerF<Graph> removal_handler) {
auto condition
@@ -684,12 +740,12 @@ AlgoPtr<Graph> TopologyTipClipperInstance(
}
template<class Graph>
-AlgoPtr<Graph> BRInstance(Graph& g,
- const config::debruijn_config::simplification::bulge_remover& br_config,
- const SimplifInfoContainer& info,
+AlgoPtr<Graph> BRInstance(Graph &g,
+ const config::debruijn_config::simplification::bulge_remover &br_config,
+ const SimplifInfoContainer &info,
HandlerF<Graph> removal_handler,
size_t /*iteration_cnt*/ = 1) {
- typedef ParallelInterestingElementFinder<Graph,
+ typedef ParallelInterestingElementFinder<Graph,
typename Graph::EdgeId> InterestingEdgeFinder;
if (!br_config.enabled || (br_config.main_iteration_only && !info.main_iteration())) {
return nullptr;
@@ -697,11 +753,11 @@ AlgoPtr<Graph> BRInstance(Graph& g,
auto alternatives_analyzer = ParseBRConfig(g, br_config);
-
+
InterestingEdgeFinder interesting_edge_finder(g,
NecessaryBulgeCondition(g,
alternatives_analyzer.max_length(),
- alternatives_analyzer.max_coverage()),
+ alternatives_analyzer.max_coverage()),
info.chunk_cnt());
if (br_config.parallel) {
INFO("Creating parallel br instance");
@@ -731,11 +787,11 @@ template<class Graph>
class FlankingCovBound : public EdgeCondition<Graph> {
typedef EdgeCondition<Graph> base;
typedef typename Graph::EdgeId EdgeId;
- const FlankingCoverage<Graph>& flanking_cov_;
+ const FlankingCoverage<Graph> &flanking_cov_;
double max_coverage_;
public:
- FlankingCovBound(const Graph& g,
- const FlankingCoverage<Graph>& flanking_cov,
+ FlankingCovBound(const Graph &g,
+ const FlankingCoverage<Graph> &flanking_cov,
double max_coverage)
: base(g),
flanking_cov_(flanking_cov),
@@ -743,8 +799,8 @@ public:
}
bool Check(EdgeId e) const override {
- return this->g().length(e) > 1
- && this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) > 1
+ return this->g().length(e) > 1
+ && this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) > 1
&& math::le(flanking_cov_.CoverageOfStart(e), max_coverage_);
}
@@ -761,11 +817,11 @@ class ParallelDisconnectionAlgorithm : public PersistentProcessingAlgorithm<Grap
omnigraph::simplification::relative_coverage::EdgeDisconnector<Graph> disconnector_;
public:
- ParallelDisconnectionAlgorithm(Graph& g,
+ ParallelDisconnectionAlgorithm(Graph &g,
pred::TypedPredicate<EdgeId> condition,
size_t chunk_cnt,
HandlerF<Graph> removal_handler,
- const Comparator& comp = Comparator(),
+ const Comparator &comp = Comparator(),
bool track_changes = true)
: base(g,
ParallelInterestingElementFinder<Graph>(g, condition, chunk_cnt),
@@ -785,10 +841,10 @@ public:
};
template<class Graph>
-AlgoPtr<Graph> LowFlankDisconnectorInstance(Graph& g,
- const FlankingCoverage<Graph>& flanking_cov,
+AlgoPtr<Graph> LowFlankDisconnectorInstance(Graph &g,
+ const FlankingCoverage<Graph> &flanking_cov,
double cov_bound,
- const SimplifInfoContainer& info,
+ const SimplifInfoContainer &info,
HandlerF<Graph> removal_handler) {
if (math::ls(cov_bound, 0.)) {
INFO("Flanking coverage based disconnection disabled");
@@ -802,8 +858,8 @@ AlgoPtr<Graph> LowFlankDisconnectorInstance(Graph& g,
}
template<class Graph>
-bool RemoveHiddenLoopEC(Graph& g,
- const FlankingCoverage<Graph>& flanking_cov,
+bool RemoveHiddenLoopEC(Graph &g,
+ const FlankingCoverage<Graph> &flanking_cov,
double determined_coverage_threshold,
config::debruijn_config::simplification::hidden_ec_remover her_config,
HandlerF<Graph> removal_handler) {
@@ -811,7 +867,7 @@ bool RemoveHiddenLoopEC(Graph& g,
INFO("Removing loops and rc loops with erroneous connections");
ECLoopRemover<Graph> hc(g, flanking_cov,
determined_coverage_threshold,
- cfg::get().simp.her.relative_threshold, removal_handler);
+ her_config.relative_threshold, removal_handler);
bool res = hc.Run();
hc.PrintLoopStats();
return res;
@@ -823,9 +879,9 @@ bool RemoveHiddenLoopEC(Graph& g,
////todo add chunk_cnt
//template<class Graph>
//bool ClipTips(
-// Graph& g,
-// const std::string& condition,
-// const SimplifInfoContainer& info,
+// Graph &g,
+// const std::string &condition,
+// const SimplifInfoContainer &info,
// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
//
// if (condition != "") {
@@ -845,9 +901,9 @@ bool RemoveHiddenLoopEC(Graph& g,
//template<class Graph>
//bool RemoveLowCoverageEdges(
-// Graph& g,
-// const std::string& condition,
-// const SimplifInfoContainer& info,
+// Graph &g,
+// const std::string &condition,
+// const SimplifInfoContainer &info,
// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
//
// if (condition != "") {
@@ -870,7 +926,7 @@ bool RemoveHiddenLoopEC(Graph& g,
//Parallel algo launch
template<class Graph>
-void ParallelCompress(Graph& g, size_t chunk_cnt, bool loop_post_compression = true) {
+void ParallelCompress(Graph &g, size_t chunk_cnt, bool loop_post_compression = true) {
INFO("Parallel compression");
debruijn::simplification::ParallelCompressor<Graph> compressor(g);
TwoStepAlgorithmRunner<Graph, typename Graph::VertexId> runner(g, false);
@@ -886,9 +942,9 @@ void ParallelCompress(Graph& g, size_t chunk_cnt, bool loop_post_compression = t
}
template<class Graph>
-bool ParallelClipTips(Graph& g,
- const string& tip_condition,
- const SimplifInfoContainer& info,
+bool ParallelClipTips(Graph &g,
+ const string &tip_condition,
+ const SimplifInfoContainer &info,
std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
INFO("Parallel tip clipping");
@@ -913,8 +969,8 @@ bool ParallelClipTips(Graph& g,
}
//template<class Graph>
-//bool ParallelRemoveBulges(Graph& g,
-// const config::debruijn_config::simplification::bulge_remover& br_config,
+//bool ParallelRemoveBulges(Graph &g,
+// const config::debruijn_config::simplification::bulge_remover &br_config,
// size_t /*read_length*/,
// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
// INFO("Parallel bulge remover");
@@ -941,9 +997,9 @@ bool ParallelClipTips(Graph& g,
//}
template<class Graph>
-bool ParallelEC(Graph& g,
- const string& ec_condition,
- const SimplifInfoContainer& info,
+bool ParallelEC(Graph &g,
+ const string &ec_condition,
+ const SimplifInfoContainer &info,
std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
INFO("Parallel ec remover");
diff --git a/src/modules/stages/simplification_pipeline/simplification_settings.hpp b/src/modules/stages/simplification_pipeline/simplification_settings.hpp
index 9b074a0..efaf4d6 100644
--- a/src/modules/stages/simplification_pipeline/simplification_settings.hpp
+++ b/src/modules/stages/simplification_pipeline/simplification_settings.hpp
@@ -6,6 +6,7 @@
//***************************************************************************
#pragma once
+#include "modules/pipeline/config_struct.hpp"
namespace debruijn {
@@ -40,14 +41,16 @@ class SimplifInfoContainer {
double detected_coverage_bound_;
bool main_iteration_;
size_t chunk_cnt_;
+ debruijn_graph::config::pipeline_type mode_;
public:
- SimplifInfoContainer() :
+ SimplifInfoContainer(debruijn_graph::config::pipeline_type mode) :
read_length_(-1ul),
detected_mean_coverage_(-1.0),
detected_coverage_bound_(-1.0),
main_iteration_(false),
- chunk_cnt_(-1ul) {
+ chunk_cnt_(-1ul),
+ mode_(mode) {
}
size_t read_length() const {
@@ -74,6 +77,10 @@ public:
return chunk_cnt_;
}
+ debruijn_graph::config::pipeline_type mode() const {
+ return mode_;
+ }
+
SimplifInfoContainer& set_read_length(size_t read_length) {
read_length_ = read_length;
return *this;
diff --git a/src/modules/visualization/position_filler.hpp b/src/modules/visualization/position_filler.hpp
index db088bb..406d679 100644
--- a/src/modules/visualization/position_filler.hpp
+++ b/src/modules/visualization/position_filler.hpp
@@ -10,6 +10,8 @@
#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
#include "assembly_graph/handlers/edges_position_handler.hpp"
#include "io/reads_io/wrapper_collection.hpp"
+#include "io/reads_io/easy_reader.hpp"
+#include "io/reads_io/io_helper.hpp"
namespace debruijn_graph {
@@ -78,7 +80,7 @@ private:
template<class gp_t>
void FillPos(gp_t &gp, const string &contig_file, string prefix, bool with_rc = false) {
PosFiller<typename gp_t::graph_t> pos_filler(gp.g, MapperInstance(gp), gp.edge_pos);
- auto irs = std::make_shared<io::PrefixAddingReaderWrapper>(io::EasyStream(contig_file, with_rc), prefix);
+ auto irs = std::make_shared<io::PrefixAddingReaderWrapper>(io::EasyStream(contig_file, with_rc, false), prefix);
pos_filler.Process(*irs);
}
diff --git a/src/modules/visualization/visualization_utils.hpp b/src/modules/visualization/visualization_utils.hpp
index 1c03492..72d4f74 100644
--- a/src/modules/visualization/visualization_utils.hpp
+++ b/src/modules/visualization/visualization_utils.hpp
@@ -34,10 +34,10 @@ void WriteComponents(const Graph& g,
}
template<class Graph>
-void DrawComponentsOfShortEdges(const Graph& g, size_t min_length, size_t sinks, size_t sources)
+void DrawComponentsOfShortEdges(const Graph& g, const string &output_dir, size_t min_length, size_t sinks, size_t sources)
{
vector<typename Graph::EdgeId> short_edges;
- std::string pics_folder_ = cfg::get().output_dir + ToString(min_length) + "_" + ToString(sinks) + "_" + ToString(sources) + "_"+ "pics_polymorphic/";
+ std::string pics_folder_ = output_dir + ToString(min_length) + "_" + ToString(sinks) + "_" + ToString(sources) + "_"+ "pics_polymorphic/";
make_dir(pics_folder_);
INFO("Writing pics with components consisting of short edges to " + pics_folder_);
shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g, min_length);
@@ -172,7 +172,7 @@ public:
labeler_(labeler),
colorer_(colorer),
output_folder_(output_folder) {
- path::make_dirs(output_folder_);
+// path::make_dirs(output_folder_);
}
void HandleDelete(EdgeId e, const string& add_label = "") {
@@ -180,11 +180,11 @@ public:
// map<EdgeId, string> empty_coloring;
auto edge_colorer = make_shared<visualization::CompositeEdgeColorer<Graph>>("black");
edge_colorer->AddColorer(colorer_);
- edge_colorer->AddColorer(make_shared<visualization::SetColorer<Graph>>(this->g(), vector<EdgeId>(1, e), "green"));
+ edge_colorer->AddColorer(make_shared<visualization::SetColorer<Graph>>(g_, vector<EdgeId>(1, e), "green"));
shared_ptr<visualization::GraphColorer<Graph>> resulting_colorer = make_shared<visualization::CompositeGraphColorer<Graph>>(colorer_, edge_colorer);
- string fn = output_folder_ + "edge_" + ToString(this->g().int_id(e)) + add_label + ".dot";
- omnigraph::visualization::WriteComponent(omnigraph::EdgeNeighborhood<Graph>(this->g(), e, 50, 250)
+ string fn = output_folder_ + "/edge_" + ToString(g_.int_id(e)) + add_label + ".dot";
+ omnigraph::visualization::WriteComponent(omnigraph::EdgeNeighborhood<Graph>(g_, e, 50, 250)
, fn
, resulting_colorer, labeler_);
}
diff --git a/src/projects/CMakeLists.txt b/src/projects/CMakeLists.txt
index 7888a79..4fd1f77 100644
--- a/src/projects/CMakeLists.txt
+++ b/src/projects/CMakeLists.txt
@@ -10,4 +10,13 @@ add_subdirectory(hammer)
add_subdirectory(ionhammer)
add_subdirectory(dipspades)
add_subdirectory(corrector)
-add_subdirectory(scaffold_correction)
\ No newline at end of file
+add_subdirectory(scaffold_correction)
+
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+ # Require at least gcc 4.8
+ if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
+ add_subdirectory(mph_test)
+ endif()
+else()
+ add_subdirectory(mph_test)
+endif()
\ No newline at end of file
diff --git a/src/projects/cap/cap_kmer_index.hpp b/src/projects/cap/cap_kmer_index.hpp
index 5275554..feab11a 100644
--- a/src/projects/cap/cap_kmer_index.hpp
+++ b/src/projects/cap/cap_kmer_index.hpp
@@ -165,17 +165,6 @@ namespace cap {
bucket = NULL;
}
- virtual RawKMerStorage* TransferBucket(size_t /* idx */) {
- VERIFY(bucket != NULL);
- TRACE("TRANSFERRING BUCKET" <<
- "BUCKET size=" << bucket->size());
-
- RawKMerStorage *ret = bucket;
- bucket = NULL;
-
- return ret;
- }
-
virtual RawKMerStorage* GetFinalKMers() {
OpenBucket(0);
VERIFY(bucket != NULL);
diff --git a/src/projects/cap/mosaic.hpp b/src/projects/cap/mosaic.hpp
index d6bdcb1..1939a3f 100644
--- a/src/projects/cap/mosaic.hpp
+++ b/src/projects/cap/mosaic.hpp
@@ -955,7 +955,7 @@ void DrawGraph(const vector<StrandRange>& all_ranges,
auto stream = io::RCWrap(StreamInstance(ExtractSequences(all_ranges, block_composition)));
auto streams = io::ReadStreamList<io::SingleRead>(stream);
// ConstructGraphUsingOldIndex(streams, gp.g, gp.index);
- ConstructGraph(config::debruijn_config::construction(), streams, gp.g, gp.index);
+ ConstructGraph(config::debruijn_config::construction(), omp_get_max_threads(), streams, gp.g, gp.index);
auto full_mosaic_pos_stream = io::RCWrap(StreamInstance(ExtractSequences(full_mosaic_ranges, block_composition), mosaic_names(full_mosaic_ranges.size())));
INFO("Threading " << full_mosaic_ranges.size() << " full mosaics");
diff --git a/src/projects/hammer/expander.cpp b/src/projects/hammer/expander.cpp
index a088dc0..74e9fe2 100644
--- a/src/projects/hammer/expander.cpp
+++ b/src/projects/hammer/expander.cpp
@@ -17,11 +17,11 @@
#include <vector>
#include <cstring>
-bool Expander::operator()(const Read &r) {
- int trim_quality = cfg::get().input_trim_quality;
+bool Expander::operator()(std::unique_ptr<Read> r) {
+ uint8_t trim_quality = (uint8_t)cfg::get().input_trim_quality;
// FIXME: Get rid of this
- Read cr = r;
+ Read cr = *r;
size_t sz = cr.trimNsAndBadQuality(trim_quality);
if (sz < hammer::K)
@@ -29,7 +29,7 @@ bool Expander::operator()(const Read &r) {
std::vector<unsigned> covered_by_solid(sz, false);
std::vector<size_t> kmer_indices(sz, -1ull);
-
+
ValidKMerGenerator<hammer::K> gen(cr);
while (gen.HasMore()) {
hammer::KMer kmer = gen.kmer();
diff --git a/src/projects/hammer/expander.hpp b/src/projects/hammer/expander.hpp
index f788e12..377f2e5 100644
--- a/src/projects/hammer/expander.hpp
+++ b/src/projects/hammer/expander.hpp
@@ -12,6 +12,7 @@ class KMerData;
class Read;
#include <cstring>
+#include <memory>
class Expander {
KMerData &data_;
@@ -23,7 +24,7 @@ class Expander {
size_t changed() const { return changed_; }
- bool operator()(const Read &r);
+ bool operator()(std::unique_ptr<Read> r);
};
#endif
diff --git a/src/projects/hammer/kmer_cluster.cpp b/src/projects/hammer/kmer_cluster.cpp
index c1b6eb3..ff153c9 100644
--- a/src/projects/hammer/kmer_cluster.cpp
+++ b/src/projects/hammer/kmer_cluster.cpp
@@ -6,6 +6,8 @@
//***************************************************************************
#include "io/reads_io/ireadstream.hpp"
+#include "dev_support/openmp_wrapper.h"
+
#include "hammer_tools.hpp"
#include "hamcluster.hpp"
#include "kmer_cluster.hpp"
diff --git a/src/projects/hammer/kmer_data.cpp b/src/projects/hammer/kmer_data.cpp
index 160950a..16d1128 100644
--- a/src/projects/hammer/kmer_data.cpp
+++ b/src/projects/hammer/kmer_data.cpp
@@ -12,10 +12,11 @@
#include "io/reads_io/ireadstream.hpp"
#include "config_struct_hammer.hpp"
-#include "dev_support/file_limit.hpp"
+#include "data_structures/mph_index/kmer_index_builder.hpp"
-#include <libcxx/sort.hpp>
#include "io/kmers_io/kmer_iterator.hpp"
+#include "utils/adt/bf.hpp"
+#include "utils/adt/hll.hpp"
using namespace hammer;
@@ -34,165 +35,76 @@ struct KMerComparator {
};
-class HammerKMerSplitter : public KMerSplitter<hammer::KMer> {
- typedef std::vector<std::vector<KMer> > KMerBuffer;
+class HammerFilteringKMerSplitter : public KMerSortingSplitter<hammer::KMer> {
+ public:
+ typedef std::function<bool(const KMer&)> KMerFilter;
- void DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const;
+ HammerFilteringKMerSplitter(std::string &work_dir,
+ KMerFilter filter = [](const KMer&) { return true; })
+ : KMerSortingSplitter<hammer::KMer>(work_dir, hammer::K),
+ filter_(std::move(filter)) {}
- public:
- HammerKMerSplitter(std::string &work_dir)
- : KMerSplitter<hammer::KMer>(work_dir, hammer::K) {}
+ path::files_t Split(size_t num_files) override;
- virtual path::files_t Split(size_t num_files);
+ private:
+ KMerFilter filter_;
friend class BufferFiller;
};
-void HammerKMerSplitter::DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const {
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned k = 0; k < num_files; ++k) {
- size_t sz = 0;
- for (size_t i = 0; i < nthreads; ++i)
- sz += buffers[i][k].size();
-
- if (!sz)
- continue;
-
- std::vector<KMer> SortBuffer;
- SortBuffer.reserve(sz);
- for (size_t i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = buffers[i];
- SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerComparator());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
-
-# pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), sizeof(KMer), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
-
- for (unsigned i = 0; i < nthreads; ++i) {
- for (unsigned j = 0; j < num_files; ++j) {
- buffers[i][j].clear();
- }
- }
-}
-
-
class BufferFiller {
- std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries_;
- unsigned num_files_;
- size_t cell_size_;
- size_t processed_;
- const HammerKMerSplitter &splitter_;
+ HammerFilteringKMerSplitter &splitter_;
public:
- BufferFiller(std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries, size_t cell_size, const HammerKMerSplitter &splitter):
- tmp_entries_(tmp_entries), num_files_((unsigned)tmp_entries[0].size()), cell_size_(cell_size), processed_(0), splitter_(splitter) {}
+ BufferFiller(HammerFilteringKMerSplitter &splitter)
+ : splitter_(splitter) {}
- size_t processed() const { return processed_; }
+ bool operator()(std::unique_ptr<Read> r) {
+ uint8_t trim_quality = (uint8_t)cfg::get().input_trim_quality;
- bool operator()(const Read &r) {
- int trim_quality = cfg::get().input_trim_quality;
-
- // FIXME: Get rid of this
- Read cr = r;
- size_t sz = cr.trimNsAndBadQuality(trim_quality);
-
- #pragma omp atomic
- processed_ += 1;
-
- if (sz < hammer::K)
- return false;
-
- HammerKMerSplitter::KMerBuffer &entry = tmp_entries_[omp_get_thread_num()];
- ValidKMerGenerator<hammer::K> gen(cr);
+ unsigned thread_id = omp_get_thread_num();
+ ValidKMerGenerator<hammer::K> gen(*r, trim_quality);
bool stop = false;
- while (gen.HasMore()) {
+ for (; gen.HasMore(); gen.Next()) {
KMer seq = gen.kmer();
- size_t idx = splitter_.GetFileNumForSeq(seq, num_files_);
- entry[idx].push_back(seq);
- stop |= entry[idx].size() > cell_size_;
+ if (!splitter_.filter_(seq))
+ continue;
- seq = !seq;
- idx = splitter_.GetFileNumForSeq(seq, num_files_);
- entry[idx].push_back(seq);
- stop |= entry[idx].size() > cell_size_;
-
- gen.Next();
+ stop |= splitter_.push_back_internal( seq, thread_id);
+ stop |= splitter_.push_back_internal(!seq, thread_id);
}
return stop;
}
};
-path::files_t HammerKMerSplitter::Split(size_t num_files) {
+path::files_t HammerFilteringKMerSplitter::Split(size_t num_files) {
unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
+ size_t reads_buffer_size = cfg::get().count_split_buffer;
INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(GetRawKMersFname(i));
+ path::files_t out = PrepareBuffers(num_files, nthreads, reads_buffer_size);
- size_t file_limit = num_files + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = cfg::get().count_split_buffer;
- if (reads_buffer_size == 0) {
- reads_buffer_size = 536870912ull;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size / (num_files * sizeof(KMer));
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
-
- INFO("Using cell size of " << cell_size);
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files);
- for (unsigned j = 0; j < num_files; ++j) {
- entry[j].reserve((size_t)(1.1 * (double)cell_size));
- }
- }
-
- size_t n = 15;
- BufferFiller filler(tmp_entries, cell_size, *this);
- const auto& dataset = cfg::get().dataset;
- for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
- INFO("Processing " << *I);
- ireadstream irs(*I, cfg::get().input_qvoffset);
+ size_t n = 15, processed = 0;
+ BufferFiller filler(*this);
+ for (const auto &reads : cfg::get().dataset.reads()) {
+ INFO("Processing " << reads);
+ ireadstream irs(reads, cfg::get().input_qvoffset);
while (!irs.eof()) {
hammer::ReadProcessor rp(nthreads);
rp.Run(irs, filler);
- DumpBuffers(num_files, nthreads, tmp_entries, out);
+ DumpBuffers(out);
VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+ processed += rp.processed();
- if (filler.processed() >> n) {
- INFO("Processed " << filler.processed() << " reads");
+ if (processed >> n) {
+ INFO("Processed " << processed << " reads");
n += 1;
}
}
}
- INFO("Processed " << filler.processed() << " reads");
+ INFO("Total " << processed << " reads processed");
return out;
}
@@ -241,11 +153,11 @@ class KMerDataFiller {
KMerDataFiller(KMerData &data)
: data_(data) {}
- bool operator()(const Read &r) {
- int trim_quality = cfg::get().input_trim_quality;
+ bool operator()(std::unique_ptr<Read> r) {
+ uint8_t trim_quality = (uint8_t)cfg::get().input_trim_quality;
// FIXME: Get rid of this
- Read cr = r;
+ Read cr = *r;
size_t sz = cr.trimNsAndBadQuality(trim_quality);
if (sz < hammer::K)
@@ -268,245 +180,152 @@ class KMerDataFiller {
};
class KMerMultiplicityCounter {
- KMerData &data_;
- uint64_t *cnt_;
-
- void IncCount(const hammer::KMer &k) {
- size_t idx = data_.seq_idx(k);
- size_t block = idx * 2 / (8 * sizeof(uint64_t)), pos = (idx * 2) % (8 * sizeof(uint64_t));
- size_t mask = 3ull << pos;
-
- if (__sync_fetch_and_or(cnt_ + block, 1ull << pos) & mask)
- __sync_fetch_and_or(cnt_ + block, 2ull << pos);
- }
+ bf::bitcounting_bloom_filter<KMer, 2> bf_;
public:
- KMerMultiplicityCounter(KMerData &data)
- : data_(data) {
- size_t blocks = (2 * data.size()) / (8 * sizeof(uint64_t)) + 1;
- cnt_ = new uint64_t[blocks];
- memset(cnt_, 0, blocks * sizeof(uint64_t));
- }
- ~KMerMultiplicityCounter() { delete[] cnt_; }
-
-
- bool operator()(const Read &r) {
- int trim_quality = cfg::get().input_trim_quality;
-
- // FIXME: Get rid of this
- Read cr = r;
- size_t sz = cr.trimNsAndBadQuality(trim_quality);
+ KMerMultiplicityCounter(size_t size)
+ : bf_([](const KMer &k, uint64_t seed) { return k.GetHash((uint32_t)seed); },
+ 4 * size) {}
- if (sz < hammer::K)
- return false;
+ ~KMerMultiplicityCounter() {}
- ValidKMerGenerator<hammer::K> gen(cr);
- while (gen.HasMore()) {
- KMer kmer = gen.kmer();
+ bool operator()(std::unique_ptr<Read> r) {
+ uint8_t trim_quality = (uint8_t)cfg::get().input_trim_quality;
- IncCount(kmer);
- IncCount(!kmer);
+ ValidKMerGenerator<hammer::K> gen(*r, trim_quality);
+ for (; gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
- gen.Next();
- }
+ bf_.add(kmer);
+ bf_.add(!kmer);
+ }
- return false;
- }
+ return false;
+ }
- size_t count(size_t idx) const {
- size_t block = idx * 2 / (8 * sizeof(uint64_t)), pos = idx * 2 % (8 * sizeof(uint64_t));
- return (cnt_[block] >> pos) & 3;
- }
+ size_t count(const KMer &k) const {
+ return bf_.lookup(k);
+ }
};
-class NonSingletonKMerSplitter : public KMerSplitter<hammer::KMer> {
- typedef std::vector<std::vector<KMer> > KMerBuffer;
-
- std::pair<size_t, size_t>
- FillBufferFromStream(io::raw_kmer_iterator<hammer::KMer> &it,
- KMerBuffer &entry,
- size_t cell_size, size_t num_files) {
- size_t processed = 0, non_singleton = 0 ;
- for ( ; it.good(); ++it) {
- hammer::KMer seq(hammer::K, *it);
-
- size_t kidx = data_.seq_idx(seq);
- size_t cnt = counter_.count(kidx);
-
- processed += 1;
-
- if (cnt == 1)
- continue;
-
- non_singleton += 1;
-
- size_t idx = this->GetFileNumForSeq(seq, (unsigned)num_files);
- entry[idx].push_back(seq);
-
-
- if (entry[idx].size() > cell_size)
- break;
- }
- return std::make_pair(processed, non_singleton);
- }
-
- void DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const {
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned k = 0; k < num_files; ++k) {
- size_t sz = 0;
- for (size_t i = 0; i < nthreads; ++i)
- sz += buffers[i][k].size();
-
- if (!sz)
- continue;
-
- std::vector<KMer> SortBuffer;
- SortBuffer.reserve(sz);
- for (size_t i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = buffers[i];
- SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerComparator());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
-
-# pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), sizeof(KMer), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
-
- for (unsigned i = 0; i < nthreads; ++i) {
- for (unsigned j = 0; j < num_files; ++j) {
- buffers[i][j].clear();
- }
- }
- }
+class KMerCountEstimator {
+ std::vector<hll::hll<KMer>> hll_;
public:
- NonSingletonKMerSplitter(std::string &work_dir,
- const std::string &final_kmers,
- const KMerData &data,
- const KMerMultiplicityCounter &counter)
- : KMerSplitter<hammer::KMer>(work_dir, hammer::K), final_kmers_(final_kmers), data_(data), counter_(counter){}
-
- virtual path::files_t Split(size_t num_files) {
- unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
-
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(GetRawKMersFname(i));
-
- size_t file_limit = num_files + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
+ KMerCountEstimator(unsigned thread_num) {
+ hll_.reserve(thread_num);
+ for (unsigned i = 0; i < thread_num; ++i)
+ hll_.emplace_back([](const KMer &k) { return k.GetHash(); });
+ }
- size_t reads_buffer_size = cfg::get().count_split_buffer;
- if (reads_buffer_size == 0) {
- reads_buffer_size = 536870912ull;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size / (num_files * sizeof(KMer));
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
-
- INFO("Using cell size of " << cell_size);
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files);
- for (unsigned j = 0; j < num_files; ++j) {
- entry[j].reserve((size_t)(1.1 * (double)cell_size));
- }
- }
+ ~KMerCountEstimator() {}
- size_t n = 15;
- size_t total_kmers = 0, non_singletons = 0;
- auto kmers = io::make_kmer_iterator<hammer::KMer>(final_kmers_, hammer::K, nthreads);
- while (std::any_of(kmers.begin(), kmers.end(),
- [](const io::raw_kmer_iterator<hammer::KMer> &it) { return it.good(); })) {
-# pragma omp parallel for num_threads(nthreads) reduction(+ : total_kmers) reduction(+ : non_singletons)
- for (size_t i = 0; i < kmers.size(); ++i) {
- size_t kc, nsc;
- std::tie(kc, nsc) = FillBufferFromStream(kmers[i], tmp_entries[i], cell_size, num_files);
- total_kmers += kc;
- non_singletons += nsc;
- }
-
- DumpBuffers(num_files, nthreads, tmp_entries, out);
- if (total_kmers >> n) {
- INFO("Processed " << total_kmers << " kmers");
- n += 1;
- }
- }
- INFO("Processed " << total_kmers << " kmers");
+ bool operator()(std::unique_ptr<Read> r) {
+ uint8_t trim_quality = (uint8_t)cfg::get().input_trim_quality;
- INFO("Total " << non_singletons << " non-singleton k-mers written");
+ ValidKMerGenerator<hammer::K> gen(*r, trim_quality);
+ for (; gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ auto &hll = hll_[omp_get_thread_num()];
- unlink(final_kmers_.c_str());
+ hll.add(kmer);
+ hll.add(!kmer);
+ }
- return out;
- }
+ return false;
+ }
- private:
- const std::string final_kmers_;
- const KMerData &data_;
- const KMerMultiplicityCounter &counter_;
+ std::pair<double, bool> cardinality() const {
+ return hll_[0].cardinality();
+ }
+
+ void merge() {
+ for (size_t i = 1; i < hll_.size(); ++i) {
+ hll_[0].merge(hll_[i]);
+ hll_[i].clear();
+ }
+ }
};
void KMerDataCounter::BuildKMerIndex(KMerData &data) {
// Build the index
std::string workdir = cfg::get().input_working_dir;
- HammerKMerSplitter splitter(workdir);
- KMerDiskCounter<hammer::KMer> counter(workdir, splitter);
- size_t kmers = KMerIndexBuilder<HammerKMerIndex>(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
- std::string final_kmers = counter.GetFinalKMersFname();
// Optionally perform a filtering step
+ size_t kmers = 0;
+ std::string final_kmers;
if (cfg::get().count_filter_singletons) {
- INFO("Filtering singleton k-mers");
- data.kmers_.set_size(kmers);
- KMerMultiplicityCounter mcounter(data);
-
- const auto& dataset = cfg::get().dataset;
- for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
- INFO("Processing " << *I);
- ireadstream irs(*I, cfg::get().input_qvoffset);
- hammer::ReadProcessor rp(omp_get_max_threads());
- rp.Run(irs, mcounter);
- VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+ size_t buffer_size;
+ {
+ INFO("Estimating k-mer count");
+
+ size_t n = 15, processed = 0;
+ KMerCountEstimator mcounter(omp_get_max_threads());
+ for (const auto &reads : cfg::get().dataset.reads()) {
+ INFO("Processing " << reads);
+ ireadstream irs(reads, cfg::get().input_qvoffset);
+ while (!irs.eof()) {
+ hammer::ReadProcessor rp(omp_get_max_threads());
+ rp.Run(irs, mcounter);
+ VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+ processed += rp.processed();
+
+ if (processed >> n) {
+ INFO("Processed " << processed << " reads");
+ n += 1;
+ }
+ }
+ }
+ INFO("Total " << processed << " reads processed");
+ mcounter.merge();
+ std::pair<double, bool> res = mcounter.cardinality();
+ if (res.second == false) {
+ buffer_size = cfg::get().count_split_buffer;
+ if (buffer_size == 0) buffer_size = 512ull * 1024 * 1024;
+ } else {
+ INFO("Estimated " << size_t(res.first) << " distinct kmers");
+ buffer_size = 3 * size_t(res.first);
+ }
}
- size_t singletons = 0;
- for (size_t idx = 0; idx < data.size(); ++idx) {
- size_t cnt = mcounter.count(idx);
- VERIFY(cnt);
- singletons += cnt == 1;
+ INFO("Filtering singleton k-mers");
+
+ KMerMultiplicityCounter mcounter(buffer_size);
+
+ size_t n = 15, processed = 0;
+ for (const auto &reads : cfg::get().dataset.reads()) {
+ INFO("Processing " << reads);
+ ireadstream irs(reads, cfg::get().input_qvoffset);
+ while (!irs.eof()) {
+ hammer::ReadProcessor rp(omp_get_max_threads());
+ rp.Run(irs, mcounter);
+ VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+ processed += rp.processed();
+
+ if (processed >> n) {
+ INFO("Processed " << processed << " reads");
+ n += 1;
+ }
+ }
}
- INFO("There are " << data.size() << " kmers in total. "
- "Among them " << data.size() - singletons << " (" << 100.0 * (double)(data.size() - singletons) / (double)data.size() << "%) are non-singletons.");
-
- NonSingletonKMerSplitter nssplitter(workdir, final_kmers, data, mcounter);
- KMerDiskCounter<hammer::KMer> nscounter(workdir, nssplitter);
- HammerKMerIndex reduced_index;
- kmers = KMerIndexBuilder<HammerKMerIndex>(workdir, num_files_, omp_get_max_threads()).BuildIndex(reduced_index, nscounter, /* save final */ true);
- data.index_.swap(reduced_index);
- final_kmers = nscounter.GetFinalKMersFname();
+ INFO("Total " << processed << " reads processed");
+
+ // FIXME: Reduce code duplication
+ HammerFilteringKMerSplitter splitter(workdir,
+ [&] (const KMer &k) { return mcounter.count(k) > 1; });
+ KMerDiskCounter<hammer::KMer> counter(workdir, splitter);
+
+ kmers = KMerIndexBuilder<HammerKMerIndex>(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
+ final_kmers = counter.GetFinalKMersFname();
+ } else {
+ HammerFilteringKMerSplitter splitter(workdir);
+ KMerDiskCounter<hammer::KMer> counter(workdir, splitter);
+
+ kmers = KMerIndexBuilder<HammerKMerIndex>(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
+ final_kmers = counter.GetFinalKMersFname();
}
+
// Check, whether we'll ever have enough memory for running BH and bail out earlier
double needed = 1.25 * (double)kmers * (sizeof(KMerStat) + sizeof(hammer::KMer));
if (needed > (double) get_memory_limit())
@@ -531,7 +350,7 @@ void KMerDataCounter::BuildKMerIndex(KMerData &data) {
}
}
- unlink(counter.GetFinalKMersFname().c_str());
+ unlink(final_kmers.c_str());
}
}
diff --git a/src/projects/ionhammer/hamcluster.hpp b/src/projects/ionhammer/hamcluster.hpp
index 019404f..23b7015 100644
--- a/src/projects/ionhammer/hamcluster.hpp
+++ b/src/projects/ionhammer/hamcluster.hpp
@@ -67,7 +67,6 @@ public:
class SubKMerStridedSerializer{
size_t from_;
- size_t to_;
size_t stride_;
public:
diff --git a/src/projects/ionhammer/kmer_data.cpp b/src/projects/ionhammer/kmer_data.cpp
index 3aae09d..9b82792 100644
--- a/src/projects/ionhammer/kmer_data.cpp
+++ b/src/projects/ionhammer/kmer_data.cpp
@@ -9,99 +9,49 @@
#include "config_struct.hpp"
#include "valid_hkmer_generator.hpp"
+#include "data_structures/mph_index/kmer_index_builder.hpp"
+
#include "io/kmers_io/mmapped_writer.hpp"
#include "io/reads_io/file_reader.hpp"
#include "io/reads_io/read_processor.hpp"
-#include <libcxx/sort.hpp>
-
using namespace hammer;
class BufferFiller;
-class HammerKMerSplitter : public KMerSplitter<hammer::HKMer> {
- typedef std::vector<std::vector<HKMer> > KMerBuffer;
-
- void DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const;
-
+class HammerKMerSplitter : public KMerSortingSplitter<HKMer> {
public:
HammerKMerSplitter(const std::string &work_dir)
- : KMerSplitter<hammer::HKMer>(work_dir, hammer::K) {}
+ : KMerSortingSplitter<HKMer>(work_dir, hammer::K) {}
- virtual path::files_t Split(size_t num_files);
+ path::files_t Split(size_t num_files) override;
friend class BufferFiller;
};
-void HammerKMerSplitter::DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const {
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned k = 0; k < num_files; ++k) {
- size_t sz = 0;
- for (size_t i = 0; i < nthreads; ++i)
- sz += buffers[i][k].size();
-
- std::vector<HKMer> SortBuffer;
- SortBuffer.reserve(sz);
- for (size_t i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = buffers[i];
- SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), HKMer::less2_fast());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
-
-# pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), sizeof(HKMer), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
-
- for (unsigned i = 0; i < nthreads; ++i) {
- for (unsigned j = 0; j < num_files; ++j) {
- buffers[i][j].clear();
- }
- }
-}
-
class BufferFiller {
- std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries_;
- unsigned num_files_;
- size_t cell_size_;
size_t processed_;
- const HammerKMerSplitter &splitter_;
+ HammerKMerSplitter &splitter_;
public:
- BufferFiller(std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries, size_t cell_size, const HammerKMerSplitter &splitter):
- tmp_entries_(tmp_entries), num_files_((unsigned)tmp_entries[0].size()), cell_size_(cell_size), processed_(0), splitter_(splitter) {}
+ BufferFiller(HammerKMerSplitter &splitter)
+ : processed_(0), splitter_(splitter) {}
size_t processed() const { return processed_; }
- bool operator()(const io::SingleRead &r) {
- ValidHKMerGenerator<hammer::K> gen(r);
- HammerKMerSplitter::KMerBuffer &entry = tmp_entries_[omp_get_thread_num()];
+ bool operator()(std::unique_ptr<io::SingleRead> r) {
+ ValidHKMerGenerator<hammer::K> gen(*r);
+ unsigned thread_id = omp_get_thread_num();
# pragma omp atomic
processed_ += 1;
bool stop = false;
while (gen.HasMore()) {
- HKMer seq = gen.kmer(); size_t idx;
+ HKMer seq = gen.kmer();
- idx = splitter_.GetFileNumForSeq(seq, num_files_);
- entry[idx].push_back(seq);
- stop |= entry[idx].size() > cell_size_;
-
- seq = !seq;
-
- idx = splitter_.GetFileNumForSeq(seq, num_files_);
- entry[idx].push_back(seq);
- stop |= entry[idx].size() > cell_size_;
+ stop |= splitter_.push_back_internal( seq, thread_id);
+ stop |= splitter_.push_back_internal(!seq, thread_id);
gen.Next();
}
@@ -112,46 +62,21 @@ class BufferFiller {
path::files_t HammerKMerSplitter::Split(size_t num_files) {
unsigned nthreads = cfg::get().max_nthreads;
+ size_t reads_buffer_size = cfg::get().count_split_buffer;
INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(GetRawKMersFname(i));
-
- size_t reads_buffer_size = cfg::get().count_split_buffer;
- if (reads_buffer_size == 0) {
- reads_buffer_size = 536870912ull;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size / (num_files * sizeof(HKMer));
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
-
- INFO("Using cell size of " << cell_size);
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files);
- for (unsigned j = 0; j < num_files; ++j) {
- entry[j].reserve((size_t)(1.1 * (double)cell_size));
- }
- }
+ path::files_t out = PrepareBuffers(num_files, nthreads, reads_buffer_size);
size_t n = 15;
- const auto& dataset = cfg::get().dataset;
- BufferFiller filler(tmp_entries, cell_size, *this);
- for (auto it = dataset.reads_begin(), et = dataset.reads_end(); it != et; ++it) {
- INFO("Processing " << *it);
- io::FileReadStream irs(*it, io::PhredOffset);
+ BufferFiller filler(*this);
+ for (const auto &reads : cfg::get().dataset.reads()) {
+ INFO("Processing " << reads);
+ io::FileReadStream irs(reads, io::PhredOffset);
hammer::ReadProcessor rp(nthreads);
while (!irs.eof()) {
rp.Run(irs, filler);
- DumpBuffers(num_files, nthreads, tmp_entries, out);
+ DumpBuffers(out);
VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
if (filler.processed() >> n) {
@@ -196,8 +121,8 @@ class KMerDataFiller {
KMerDataFiller(KMerData &data)
: data_(data) {}
- bool operator()(const io::SingleRead &r) const {
- ValidHKMerGenerator<hammer::K> gen(r);
+ bool operator()(std::unique_ptr<io::SingleRead> r) const {
+ ValidHKMerGenerator<hammer::K> gen(*r);
while (gen.HasMore()) {
HKMer kmer = gen.kmer();
double correct = gen.correct_probability();
diff --git a/src/projects/ionhammer/read_corrector.hpp b/src/projects/ionhammer/read_corrector.hpp
index 952972c..def12aa 100644
--- a/src/projects/ionhammer/read_corrector.hpp
+++ b/src/projects/ionhammer/read_corrector.hpp
@@ -318,11 +318,9 @@ class HKMerProlonger {
/// @param[in] seed kmer to prolong
/// @param[in] bases_to_recover maximum number of bases to recover
- /// @param[in] side side to prolong to (RightSide/LeftSide)
template <typename Side>
std::deque<hammer::HomopolymerRun> prolong(const hammer::HKMer &seed,
- size_t bases_to_recover,
- Side side) {
+ size_t bases_to_recover) {
std::deque<hammer::HomopolymerRun> good_runs(hammer::K);
for (size_t i = 0; i < hammer::K; ++i)
good_runs[i] = seed[i];
@@ -1128,8 +1126,12 @@ public:
kmer_data_(kmer_data), sam_header_(NULL),
debug_pred_(debug), select_pred_(select) {}
- boost::optional<io::SingleRead> operator()(const io::SingleRead &r) {
- if (!select_pred_(r))return boost::optional<io::SingleRead>();
+ std::unique_ptr<io::SingleRead> operator()(std::unique_ptr<io::SingleRead> r) {
+ return operator()(*r);
+ }
+
+ std::unique_ptr<io::SingleRead> operator()(const io::SingleRead &r) {
+ if (!select_pred_(r)) return nullptr;
bool debug_mode = debug_pred_(r);
if (debug_mode) {
std::cerr << "=============================================" << std::endl;
@@ -1149,28 +1151,28 @@ public:
auto seq = read.GetSequenceString();
if (seq.empty())
- return boost::optional<io::SingleRead>();
+ return nullptr;
- return io::SingleRead(r.name(), seq);
+ return std::unique_ptr<io::SingleRead>(new io::SingleRead(r.name(), seq));
}
- boost::optional<io::BamRead>
- operator()(BamTools::BamAlignment &alignment) {
+ std::unique_ptr<io::BamRead>
+ operator()(std::unique_ptr<BamTools::BamAlignment> alignment) {
VERIFY(sam_header_);
- io::SingleRead r(alignment.Name, alignment.QueryBases);
+ io::SingleRead r(alignment->Name, alignment->QueryBases);
// reverse strand means we're working with a mapped BAM, might be
// the case for datasets downloaded from IonCommunity
- if (alignment.IsReverseStrand())
+ if (alignment->IsReverseStrand())
r = !r;
auto corrected_r = operator()(r);
std::string rg;
- if (!alignment.GetTag("RG", rg) || !corrected_r)
- return boost::optional<io::BamRead>();
+ if (!alignment->GetTag("RG", rg) || !corrected_r)
+ return nullptr;
auto flow_order = sam_header_->ReadGroups[rg].FlowOrder;
float delta_score, fit_score;
- auto seq = corrected_r.get().GetSequenceString();
- if (alignment.IsReverseStrand()) {
+ auto seq = corrected_r->GetSequenceString();
+ if (alignment->IsReverseStrand()) {
std::reverse(seq.begin(), seq.end());
for (auto it = seq.begin(); it != seq.end(); ++it) {
switch (*it) {
@@ -1183,17 +1185,17 @@ public:
}
}
- BaseHypothesisEvaluator(alignment, flow_order, seq,
+ BaseHypothesisEvaluator(*alignment, flow_order, seq,
delta_score, fit_score, 0);
std::stringstream ss;
- ss << alignment.Name << "_" << delta_score << "_" << fit_score;
- alignment.Name = ss.str();
+ ss << alignment->Name << "_" << delta_score << "_" << fit_score;
+ alignment->Name = ss.str();
if (delta_score >= cfg::get().delta_score_threshold)
- return io::BamRead(alignment);
+ return std::unique_ptr<io::BamRead>(new io::BamRead(*alignment));
- BamTools::BamAlignment corrected(alignment);
- corrected.QueryBases = corrected_r.get().GetSequenceString();
- return io::BamRead(corrected);
+ BamTools::BamAlignment corrected(*alignment);
+ corrected.QueryBases = corrected_r->GetSequenceString();
+ return std::unique_ptr<io::BamRead>(new io::BamRead(corrected));
}
};
@@ -1204,14 +1206,14 @@ class PairedReadCorrector : public SingleReadCorrector {
SelectPredicate &select)
: SingleReadCorrector(kmer_data, debug, select) {}
- boost::optional<io::PairedRead> operator()(const io::PairedRead &r) {
- auto corrected_r = SingleReadCorrector::operator()(r.first());
- auto corrected_l = SingleReadCorrector::operator()(r.second());
+ std::unique_ptr<io::PairedRead> operator()(std::unique_ptr<io::PairedRead> r) {
+ auto corrected_r = SingleReadCorrector::operator()(r->first());
+ auto corrected_l = SingleReadCorrector::operator()(r->second());
if (!corrected_r || !corrected_l)
- return boost::optional<io::PairedRead>();
+ return nullptr;
- return io::PairedRead(corrected_r.get(), corrected_l.get(), 0);
+ return std::unique_ptr<io::PairedRead>(new io::PairedRead(*corrected_r, *corrected_l, 0));
}
};
diff --git a/src/projects/mph_test/CMakeLists.txt b/src/projects/mph_test/CMakeLists.txt
new file mode 100644
index 0000000..0d0a3d8
--- /dev/null
+++ b/src/projects/mph_test/CMakeLists.txt
@@ -0,0 +1,15 @@
+############################################################################
+# Copyright (c) 2016 Saint Petersburg State University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+# Project for the spades-kmercount executable, built from main.cpp
+project(spades-kmercount CXX)
+
+# Make headers in this directory visible to the sources
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+
+add_executable(spades-kmercount
+ main.cpp)
+
+# Link against spades_modules and the libraries listed in COMMON_LIBRARIES
+target_link_libraries(spades-kmercount spades_modules ${COMMON_LIBRARIES})
diff --git a/src/projects/mph_test/main.cpp b/src/projects/mph_test/main.cpp
new file mode 100644
index 0000000..c638f77
--- /dev/null
+++ b/src/projects/mph_test/main.cpp
@@ -0,0 +1,184 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/logger/log_writers.hpp"
+#include "dev_support/segfault_handler.hpp"
+#include "data_structures/indices/perfect_hash_map.hpp"
+#include "data_structures/sequence/runtime_k.hpp"
+#include "data_structures/mph_index/kmer_index_builder.hpp"
+
+#include "io/reads_io/read_processor.hpp"
+#include "io/reads_io/io_helper.hpp"
+
+#include "version.hpp"
+
+#include <cxxopts/cxxopts.hpp>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string>
+
+using namespace std;
+// Creates a logger with an empty name, gives it a console writer and
+// attaches it via attach_logger().
+void create_console_logger() {
+    using namespace logging;
+
+    logger *console_log = create_logger("");
+    console_log->add_writer(std::make_shared<console_writer>());
+    attach_logger(console_log);
+}
+
+// KeyIteratingMap from runtime_k::RtSeq to uint32_t; adds nothing besides a
+// constructor that forwards k and the working directory to the base class.
+class SimplePerfectHashMap : public debruijn_graph::KeyIteratingMap<runtime_k::RtSeq, uint32_t> {
+    typedef debruijn_graph::KeyIteratingMap<runtime_k::RtSeq, uint32_t> Parent;
+
+  public:
+    SimplePerfectHashMap(size_t k, const std::string &workdir)
+            : Parent(k, workdir) {}
+};
+
+// Splitter that reads the queued input files with ReadProcessor and pushes
+// every k-mer of each read into the buffers provided by KMerSortingSplitter.
+class ParallelSortingSplitter : public KMerSortingSplitter<runtime_k::RtSeq> {
+    using Seq = runtime_k::RtSeq;
+
+    // Input files registered through push_back()
+    std::vector<std::string> files_;
+    // Thread count handed to PrepareBuffers() and ReadProcessor
+    unsigned nthreads_;
+    // Buffer size handed to PrepareBuffers()
+    size_t read_buffer_size_;
+
+    // Per-read callback: counts reads and feeds their k-mers to the splitter
+    class BufferFiller {
+        size_t processed_;
+        ParallelSortingSplitter &splitter_;
+        unsigned K_;
+
+      public:
+        BufferFiller(ParallelSortingSplitter &splitter, unsigned K)
+                : processed_(0), splitter_(splitter), K_(K) {}
+
+        size_t processed() const { return processed_; }
+
+        // Returns true if any push_back_internal() call made for this read returned true;
+        // reads shorter than K are counted but contribute no k-mers.
+        bool operator()(std::unique_ptr<io::SingleRead> r) {
+#           pragma omp atomic
+            processed_ += 1;
+
+            const Sequence &seq = r->sequence();
+            const size_t read_length = seq.size();
+            if (read_length < K_)
+                return false;
+
+            const unsigned thread_id = omp_get_thread_num();
+            runtime_k::RtSeq kmer = seq.start<runtime_k::RtSeq>(K_) >> 'A';
+
+            bool buffer_full = false;
+            size_t pos = K_ - 1;
+            while (pos < read_length) {
+                kmer <<= seq[pos];
+                // Every k-mer is pushed; we only remember whether some call reported true
+                if (splitter_.push_back_internal(kmer, thread_id))
+                    buffer_full = true;
+                ++pos;
+            }
+
+            return buffer_full;
+        }
+    };
+
+
+  public:
+    ParallelSortingSplitter(const std::string &workdir, unsigned K, unsigned nthreads, size_t read_buffer_size = 0)
+            : KMerSortingSplitter<Seq>(workdir, K), nthreads_(nthreads), read_buffer_size_(read_buffer_size) {}
+
+    // Queues one more input file for Split()
+    void push_back(const std::string &filename) {
+        files_.push_back(filename);
+    }
+
+    // Streams all queued files through BufferFiller, dumping the buffers after
+    // every ReadProcessor run, and returns the file list obtained from PrepareBuffers().
+    path::files_t Split(size_t num_files) override {
+        INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+        path::files_t bucket_files = PrepareBuffers(num_files, nthreads_, read_buffer_size_);
+
+        // Progress is reported whenever the read count has a bit set at or above this shift
+        size_t report_shift = 10;
+        BufferFiller filler(*this, K());
+        for (const auto &fname : files_) {
+            INFO("Processing " << fname);
+            auto stream = io::EasyStream(fname, true, true);
+            while (!stream->eof()) {
+                hammer::ReadProcessor rp(nthreads_);
+                rp.Run(*stream, filler);
+                DumpBuffers(bucket_files);
+                VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+
+                if (filler.processed() >> report_shift) {
+                    INFO("Processed " << filler.processed() << " reads");
+                    ++report_shift;
+                }
+            }
+        }
+        INFO("Total " << filler.processed() << " reads processed");
+
+        return bucket_files;
+    }
+};
+
+// Entry point of the k-mer counting tool: parses the command line, collects the
+// input files (from the dataset description if given, otherwise from the
+// positional arguments) and runs KMerDiskCounter over them.
+int main(int argc, char* argv[]) {
+    perf_counter pc;
+
+    srand(42);
+    srandom(42);
+    try {
+        unsigned nthreads;
+        unsigned K;
+        std::string workdir, dataset_file;
+        std::vector<std::string> input_files;
+        size_t read_buffer_size;
+
+        cxxopts::Options options(argv[0], " <input files> - SPAdes k-mer counting engine");
+        options.add_options()
+                ("k,kmer", "K-mer length", cxxopts::value<unsigned>(K)->default_value("21"), "K")
+                ("d,dataset", "Dataset description (in YAML), input files ignored", cxxopts::value<std::string>(dataset_file), "file")
+                ("t,threads", "# of threads to use", cxxopts::value<unsigned>(nthreads)->default_value(std::to_string(omp_get_max_threads())), "num")
+                ("w,workdir", "Working directory to use", cxxopts::value<std::string>(workdir)->default_value("."), "dir")
+                ("b,bufsize", "Sorting buffer size, per thread", cxxopts::value<size_t>(read_buffer_size)->default_value("536870912"))
+                ("h,help", "Print help");
+
+        options.add_options("Input")
+                ("positional", "", cxxopts::value<std::vector<std::string>>(input_files));
+
+        options.parse_positional("positional");
+        options.parse(argc, argv);
+
+        // Help request: print usage and leave successfully
+        if (options.count("help")) {
+            std::cout << options.help() << std::endl;
+            exit(0);
+        }
+
+        // Neither positional inputs nor a dataset description were supplied
+        if (!(options.count("positional") || options.count("dataset"))) {
+            std::cerr << "ERROR: No input files were specified" << std::endl << std::endl;
+            std::cout << options.help() << std::endl;
+            exit(-1);
+        }
+
+        create_console_logger();
+
+        INFO("Starting SPAdes k-mer counting engine, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
+
+        INFO("K-mer length set to " << K);
+        INFO("# of threads to use: " << nthreads);
+
+        SimplePerfectHashMap index(K, workdir);
+        ParallelSortingSplitter splitter(workdir, K, nthreads, read_buffer_size);
+        if (options.count("dataset")) {
+            // The dataset description takes precedence over positional inputs
+            io::DataSet<> loaded_dataset;
+            loaded_dataset.load(dataset_file);
+            for (const auto &fname : loaded_dataset.reads()) {
+                splitter.push_back(fname);
+            }
+        } else {
+            for (const auto &fname : input_files) {
+                splitter.push_back(fname);
+            }
+        }
+
+        KMerDiskCounter<runtime_k::RtSeq> counter(workdir, splitter);
+        counter.CountAll(16, nthreads);
+        INFO("K-mer counting done, kmers saved to " << counter.GetFinalKMersFname());
+    } catch (std::string const &msg) {
+        std::cerr << msg;
+        return EINTR;
+    } catch (const cxxopts::OptionException &err) {
+        std::cerr << "error parsing options: " << err.what() << std::endl;
+        exit(1);
+    }
+
+    return 0;
+}
diff --git a/src/projects/online_vis/debruijn_online_visualizer.hpp b/src/projects/online_vis/debruijn_online_visualizer.hpp
index 63ff0a7..698d7c8 100644
--- a/src/projects/online_vis/debruijn_online_visualizer.hpp
+++ b/src/projects/online_vis/debruijn_online_visualizer.hpp
@@ -37,7 +37,7 @@ class DebruijnOnlineVisualizer : public OnlineVisualizer<DebruijnEnvironment> {
AddCommand(make_shared<DrawConnectedCommand>());
AddCommand(make_shared<ShowPositionCommand>());
AddCommand(make_shared<DrawMisassemblies>());
-
+ AddCommand(make_shared<DrawCoverageDropsCommand>());
AddCommand(make_shared<PrintPathsCommand>());
AddCommand(make_shared<PrintContigsStatsCommand>());
AddCommand(make_shared<JunctionSequenceCommand>());
diff --git a/src/projects/online_vis/drawing_commands/draw_contig_command.hpp b/src/projects/online_vis/drawing_commands/draw_contig_command.hpp
index c8baba1..37b90b9 100644
--- a/src/projects/online_vis/drawing_commands/draw_contig_command.hpp
+++ b/src/projects/online_vis/drawing_commands/draw_contig_command.hpp
@@ -58,11 +58,11 @@ public:
if (!CheckFileExists(contigs_file))
return;
- auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(contigs_file));
+ io::FileReadStream reader(contigs_file);
- while (!reader->eof()) {
+ while (!reader.eof()) {
io::SingleRead read;
- (*reader) >> read;
+ reader >> read;
//LOG("Contig " << read.name() << " is being processed now");
// if the name contains a given string <contig_name> as a substring.
@@ -112,11 +112,11 @@ public:
if (!CheckFileExists(contigs_file))
return;
- auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(contigs_file));
+ io::FileReadStream reader(contigs_file);
- while (!reader->eof()) {
+ while (!reader.eof()) {
io::SingleRead read;
- (*reader) >> read;
+ reader >> read;
//LOG("Contig " << read.name() << " is being processed now");
DrawPicturesAlongContig(curr_env, read);
diff --git a/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp b/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp
index f123b87..c3b2011 100644
--- a/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp
+++ b/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp
@@ -178,32 +178,23 @@ public:
}
string file = args[1];
- auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
- FillerClass& filler = curr_env.filler();
- while (!reader->eof()) {
- io::SingleRead read;
- (*reader) >> read;
- Sequence contig = read.sequence();
- filler.Process(contig, "miss_" + read.name());
- filler.Process(!contig, "miss_" + read.name() + "_RC");
- }
- reader->close();
+
+ FillPos(curr_env.graph_pack(), file, "miss", true);
cout << "All contigs are mapped" << endl;
- reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
+
auto genome_mapping_path = curr_env.mapper().MapSequence(curr_env.genome());
auto rc_genome_mapping_path = curr_env.mapper().MapSequence(!curr_env.genome());
cout << "Genome is mapped" << endl;
- while(!reader->eof()) {
+ io::FileReadStream reader(file);
+ // Process contigs until the stream is exhausted (the condition must be negated:
+ // looping on eof() itself would skip every contig of a non-empty file).
+ while(!reader.eof()) {
io::SingleRead read;
- (*reader) >> read;
- Sequence contig = read.sequence();
- cout << "Read " << read.name() << " is processed." << endl;
-
- auto mapping_path = curr_env.mapper().MapSequence(contig);
+ reader >> read;
+ auto mapping_path = curr_env.mapper().MapRead(read);
ProcessContig(curr_env, genome_mapping_path, rc_genome_mapping_path, mapping_path, read.name());
+ cout << "Read " << read.name() << " is processed." << endl;
}
}
diff --git a/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp b/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp
index 2044e6a..23c69ed 100644
--- a/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp
+++ b/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp
@@ -448,12 +448,12 @@ public:
contig_cnt = std::stoll(args[4]);
}
- auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(contigs_file));
+ io::FileReadStream reader(contigs_file);
size_t i = 0;
- while (!reader->eof() && i < contig_cnt) {
+ while (!reader.eof() && i < contig_cnt) {
io::SingleRead contig;
- (*reader) >> contig;
+ reader >> contig;
LOG("Considering contig " << contig.name());
if (AnalyzeGaps(curr_env, contig, base_assembly_prefix,
@@ -585,6 +585,7 @@ public:
if (!CheckFileExists(contigs_file)) {
LOG("File with contigs " << contigs_file << " not found");
+ return;
}
size_t contig_cnt = -1u;
@@ -593,12 +594,12 @@ public:
contig_cnt = std::stoll(args[3]);
}
- auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(contigs_file));
+ io::FileReadStream reader(contigs_file);
size_t i = 0;
- while (!reader->eof() && i < contig_cnt) {
+ while (!reader.eof() && i < contig_cnt) {
io::SingleRead contig;
- (*reader) >> contig;
+ reader >> contig;
LOG("Considering contig " << contig.name());
if (IsPoorlyAssembled(curr_env.graph_pack(), contig, base_assembly_prefix)) {
@@ -614,4 +615,126 @@ public:
}
};
+
+// Command `draw_coverage_drops`: maps contigs read from a file onto the graph,
+// splits each mapping path into pieces and draws the pieces whose long,
+// non-repeat edges differ in coverage by more than cov_drop.
+class DrawCoverageDropsCommand : public DrawingCommand {
+    // Coverage difference (max - min) above which a path counts as having a drop
+    const size_t cov_drop = 25;
+    // Only edges strictly longer than this take part in the coverage comparison
+    const size_t min_edge_len = 2000;
+private:
+
+    // An edge is treated as a repeat when its start vertex has at least two
+    // incoming edges or its end vertex has at least two outgoing edges.
+    bool IsRepeat(const GraphPack& gp, EdgeId e) const {
+        auto v1 = gp.g.EdgeStart(e);
+        auto v2 = gp.g.EdgeEnd(e);
+        return gp.g.IncomingEdgeCount(v1) >= 2 || gp.g.OutgoingEdgeCount(v2) >= 2 ;
+    }
+
+    // Cuts the path after every edge whose end vertex has no outgoing edges and
+    // before every edge whose start vertex has no incoming edges.
+    // Empty pieces are never emitted.
+    std::vector<std::vector<EdgeId>> Split(const GraphPack& gp, const std::vector<EdgeId>& mapping_path) const {
+        std::vector<std::vector<EdgeId>> answer;
+        std::vector<EdgeId> temp;
+        for (auto e : mapping_path) {
+
+            if (gp.g.OutgoingEdgeCount(gp.g.EdgeEnd(e)) == 0) {
+                // Dead end: the current piece finishes with this edge
+                temp.push_back(e);
+                answer.push_back(temp);
+                temp.clear();
+                continue;
+            }
+
+            if (gp.g.IncomingEdgeCount(gp.g.EdgeStart(e)) == 0) {
+                // Source edge: it opens a new piece; flush the previous one only
+                // if it actually contains edges
+                if (!temp.empty()) {
+                    answer.push_back(temp);
+                    temp.clear();
+                }
+                temp.push_back(e);
+                continue;
+            }
+            temp.push_back(e);
+        }
+        if (!temp.empty()) {
+            answer.push_back(temp);
+        }
+        return answer;
+    }
+
+    // Returns true when the coverage of the considered edges (non-repeat, longer
+    // than min_edge_len) spans more than cov_drop. With no considered edges the
+    // result is false.
+    bool HasCoverageDrops(const GraphPack& gp, const std::vector<EdgeId>& mapping_path) const {
+        double min_coverage = std::numeric_limits<double>::max();
+        double max_coverage = 0;
+
+        for (auto e : mapping_path) {
+            if (!IsRepeat(gp, e) && gp.g.length(e) > min_edge_len) {
+                min_coverage = std::min(gp.g.coverage(e), min_coverage);
+                max_coverage = std::max(gp.g.coverage(e), max_coverage);
+            }
+        }
+        return max_coverage > min_coverage && max_coverage - min_coverage > cov_drop;
+    }
+
+protected:
+    size_t MinArgNumber() const {
+        return 1;
+    }
+
+    bool CheckCorrectness(const vector<string>& args) const {
+        return CheckEnoughArguments(args);
+    }
+
+public:
+    string Usage() const {
+        string answer;
+        answer = answer + "Command `draw_coverage_drops` \n" + "Usage:\n"
+                + "> draw_coverage_drops <contigs_file> [first N contigs to analyze]\n"
+                + " Draws pictures of contigs that have substantial coverage drops during theirs alignments to the graph.";
+        return answer;
+    }
+
+    DrawCoverageDropsCommand()
+            : DrawingCommand("draw_coverage_drops") {
+    }
+
+    // args[1] is the contigs file; optional args[2] limits how many contigs are analyzed.
+    void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+        const vector<string>& args = arg_list.GetAllArguments();
+        if (!CheckCorrectness(args))
+            return;
+
+        std::string contigs_file = args[1];
+        if (!CheckFileExists(contigs_file)) {
+            LOG("File with contigs " << contigs_file << " not found");
+            return;
+        }
+
+        // Default limit when no contig count is given (same sentinel as the sibling commands)
+        size_t contig_cnt = -1u;
+        if (args.size() > 2) {
+            LOG("Will analyze first " << args[2] << " contigs");
+            contig_cnt = std::stoll(args[2]);
+        }
+
+
+        io::FileReadStream reader(contigs_file);
+
+        size_t i = 0;
+        while (!reader.eof() && i < contig_cnt) {
+            io::SingleRead contig;
+            reader >> contig;
+            LOG("Considering contig " << contig.name());
+
+            std::vector<EdgeId> mapping_path = debruijn_graph::MapperInstance(curr_env.graph_pack())->MapRead(contig).simple_path();
+            std::vector<std::vector<EdgeId>> splitted_path = Split(curr_env.graph_pack(), mapping_path);
+            // Iterate by reference: the pieces are only inspected and drawn
+            for (const auto& subpath : splitted_path) {
+                if (HasCoverageDrops(curr_env.graph_pack(), subpath)) {
+                    LOG("Has coverage drops, drawing");
+                    DrawPicturesAlongPath(curr_env, subpath, contig.name());
+                } else {
+                    LOG("OK");
+                }
+            }
+
+            ++i;
+        }
+
+    }
+
+};
+
}
diff --git a/src/projects/online_vis/drawing_commands/drawing_command.hpp b/src/projects/online_vis/drawing_commands/drawing_command.hpp
index c393978..c825b7e 100644
--- a/src/projects/online_vis/drawing_commands/drawing_command.hpp
+++ b/src/projects/online_vis/drawing_commands/drawing_command.hpp
@@ -48,14 +48,9 @@ protected:
curr_env.picture_counter_++;
}
- void DrawPicturesAlongSequence(DebruijnEnvironment& curr_env, const Sequence& s, string label = "") const {
- DrawPicturesAlongPath(curr_env, curr_env.mapper().MapSequence(s).simple_path(), label);
- }
-
void DrawPicturesAlongContig(DebruijnEnvironment& curr_env, io::SingleRead contig) const {
- Sequence seq = contig.sequence();
string label = contig.name();
- DrawPicturesAlongSequence(curr_env, seq, label);
+ DrawPicturesAlongPath(curr_env, curr_env.mapper().MapRead(contig).simple_path(), label);
LOG("Contig " << contig.name() << " has been drawn");
}
diff --git a/src/projects/online_vis/position_commands/fill_position_command.hpp b/src/projects/online_vis/position_commands/fill_position_command.hpp
index b48b04b..604f926 100644
--- a/src/projects/online_vis/position_commands/fill_position_command.hpp
+++ b/src/projects/online_vis/position_commands/fill_position_command.hpp
@@ -50,16 +50,7 @@ namespace online_visualization {
string name = args[1];
string file = args[2];
- auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
-
- FillerClass& filler = curr_env.filler();
- while (!reader->eof()) {
- io::SingleRead read;
- (*reader) >> read;
- Sequence contig = read.sequence();
- filler.Process(contig, name + "_" + read.name());
- filler.Process(!contig, name + "_" + read.name() + "_RC");
- }
+ FillPos(curr_env.graph_pack(), file, name, true);
}
};
}
diff --git a/src/projects/online_vis/processing_commands.hpp b/src/projects/online_vis/processing_commands.hpp
index de8e60c..6d1a620 100644
--- a/src/projects/online_vis/processing_commands.hpp
+++ b/src/projects/online_vis/processing_commands.hpp
@@ -58,7 +58,7 @@ private:
std::ref(curr_env.graph_pack().edge_qual), std::placeholders::_1);
}
}
- debruijn::simplification::SimplifInfoContainer info;
+ debruijn::simplification::SimplifInfoContainer info(debruijn_graph::config::pipeline_type::base);
info.set_chunk_cnt(10);
debruijn::simplification::TipClipperInstance(curr_env.graph(), condition, info, (omnigraph::HandlerF<Graph>)nullptr)->Run();
}
diff --git a/src/projects/online_vis/vis_logger.hpp b/src/projects/online_vis/vis_logger.hpp
index 0b38093..42bd6a7 100644
--- a/src/projects/online_vis/vis_logger.hpp
+++ b/src/projects/online_vis/vis_logger.hpp
@@ -10,19 +10,19 @@
#undef INFO
#define INFO(message) \
{ \
- cout << __FILE__ << " " << __LINE__ << " ::: " << message << endl; \
+ std::cout << __FILE__ << " " << __LINE__ << " ::: " << message << std::endl; \
} \
#define LOG(message) \
{ \
- cout << message << endl; \
+ std::cout << message << endl; \
} \
//#define trace(message) LOG_MSG(logging::L_TRACE, message)
#define debug(print, message) \
{ \
if (print) { \
- cout << message << endl; \
+ std::cout << message << std::endl; \
} \
}
diff --git a/src/projects/scaffold_correction/scaffold_correction.hpp b/src/projects/scaffold_correction/scaffold_correction.hpp
index 8488bee..0237e6b 100644
--- a/src/projects/scaffold_correction/scaffold_correction.hpp
+++ b/src/projects/scaffold_correction/scaffold_correction.hpp
@@ -6,6 +6,7 @@
#pragma once
#include "io/reads_io/osequencestream.hpp"
+#include "io/reads_io/file_reader.hpp"
#include "pipeline/stage.hpp"
#include "pipeline/graph_pack.hpp"
#include "assembly_graph/paths/path_processor.hpp"
diff --git a/src/projects/spades/chromosome_removal.cpp b/src/projects/spades/chromosome_removal.cpp
index 6f5e067..f2282d5 100644
--- a/src/projects/spades/chromosome_removal.cpp
+++ b/src/projects/spades/chromosome_removal.cpp
@@ -177,7 +177,7 @@ void ChromosomeRemoval::PlasmidSimplify(conj_graph_pack &gp, size_t long_edge_bo
void ChromosomeRemoval::run(conj_graph_pack &gp, const char*) {
//FIXME Seriously?! cfg::get().ds like hundred times...
- OutputContigs(gp.g, cfg::get().output_dir + "before_chromosome_removal", false, 0, false);
+ OutputContigs(gp.g, cfg::get().output_dir + "before_chromosome_removal", false);
INFO("Before iteration " << 0 << ", " << gp.g.size() << " vertices in graph");
double chromosome_coverage = RemoveLongGenomicEdges(gp, cfg::get().pd->long_edge_length, cfg::get().pd->relative_coverage );
PlasmidSimplify(gp, cfg::get().pd->long_edge_length);
diff --git a/src/projects/spades/distance_estimation.cpp b/src/projects/spades/distance_estimation.cpp
index 481a457..ed6ebf2 100644
--- a/src/projects/spades/distance_estimation.cpp
+++ b/src/projects/spades/distance_estimation.cpp
@@ -226,7 +226,10 @@ void DistanceEstimation::run(conj_graph_pack &gp, const char*) {
INFO("Processing library #" << i);
estimate_distance(gp, cfg::get().ds.reads[i], gp.paired_indices[i], gp.clustered_indices[i], gp.scaffolding_indices[i]);
}
- gp.paired_indices[i].Clear();
+ if (!cfg::get().preserve_raw_paired_index) {
+ INFO("Clearing raw paired index");
+ gp.paired_indices[i].Clear();
+ }
}
}
diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp
index 7c43178..e311945 100644
--- a/src/projects/spades/gap_closer.cpp
+++ b/src/projects/spades/gap_closer.cpp
@@ -491,9 +491,10 @@ void GapClosing::run(conj_graph_pack &gp, const char *) {
}
gp.EnsureIndex();
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].type() == io::LibraryType::PairedEnd) {
- auto streams = paired_binary_readers(cfg::get().ds.reads[i], false, 0);
+ auto& dataset = cfg::get_writable().ds;
+ for (size_t i = 0; i < dataset.reads.lib_count(); ++i) {
+ if (dataset.reads[i].type() == io::LibraryType::PairedEnd) {
+ auto streams = paired_binary_readers(dataset.reads[i], false, 0);
CloseGaps(gp, streams);
}
}
diff --git a/src/projects/spades/main.cpp b/src/projects/spades/main.cpp
index 5b66301..a14d4fa 100644
--- a/src/projects/spades/main.cpp
+++ b/src/projects/spades/main.cpp
@@ -10,6 +10,7 @@
*/
#include "dev_support/logger/log_writers.hpp"
+#include "dev_support/memory_limit.hpp"
#include "dev_support/segfault_handler.hpp"
#include "launch.hpp"
#include "dev_support/copy_file.hpp"
diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp
index 6e6cae0..d19ffb2 100644
--- a/src/projects/spades/mismatch_correction.cpp
+++ b/src/projects/spades/mismatch_correction.cpp
@@ -14,12 +14,14 @@ namespace debruijn_graph {
void MismatchCorrection::run(conj_graph_pack &gp, const char*) {
gp.EnsureBasicMapping();
+
+ auto& dataset = cfg::get_writable().ds;
std::vector<size_t> libs;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].is_mismatch_correctable())
+ for (size_t i = 0; i < dataset.reads.lib_count(); ++i) {
+ if (dataset.reads[i].is_mismatch_correctable())
libs.push_back(i);
}
- auto streams = single_binary_readers_for_libs(libs, true, true);
+ auto streams = single_binary_readers_for_libs(dataset, libs, true, true);
size_t corrected = MismatchShallNotPass<conj_graph_pack, io::SingleReadSeq>(gp, 2).ParallelStopAllMismatches(streams, 1);
INFO("Corrected " << corrected << " nucleotides");
}
diff --git a/src/projects/spades/pacbio_aligning.cpp b/src/projects/spades/pacbio_aligning.cpp
index 2c62e26..974251f 100644
--- a/src/projects/spades/pacbio_aligning.cpp
+++ b/src/projects/spades/pacbio_aligning.cpp
@@ -22,7 +22,7 @@ void ProcessReadsBatch(conj_graph_pack &gp,
vector<PathStorage<Graph> > long_reads_by_thread(cfg::get().max_threads,
PathStorage<Graph>(gp.g));
vector<pacbio::GapStorage<Graph> > gaps_by_thread(cfg::get().max_threads,
- pacbio::GapStorage<Graph>(gp.g, min_gap_quantity));
+ pacbio::GapStorage<Graph>(gp.g, min_gap_quantity,cfg::get().pb.long_seq_limit));
vector<pacbio::StatsCounter> stats_by_thread(cfg::get().max_threads);
size_t longer_500 = 0;
@@ -110,7 +110,7 @@ void align_pacbio(conj_graph_pack &gp, int lib_id, bool make_additional_saves) {
min_gap_quantity = cfg::get().pb.contigs_min_gap_quantity;
rtype = 2;
}
- pacbio::GapStorage<ConjugateDeBruijnGraph> gaps(gp.g, min_gap_quantity);
+ pacbio::GapStorage<ConjugateDeBruijnGraph> gaps(gp.g, min_gap_quantity, cfg::get().pb.long_seq_limit);
size_t read_buffer_size = 50000;
std::vector<io::SingleRead> reads(read_buffer_size);
io::SingleRead read;
@@ -118,7 +118,7 @@ void align_pacbio(conj_graph_pack &gp, int lib_id, bool make_additional_saves) {
INFO("Usign seed size: " << cfg::get().pb.pacbio_k);
pacbio::PacBioMappingIndex<ConjugateDeBruijnGraph> pac_index(gp.g,
cfg::get().pb.pacbio_k,
- cfg::get().K, cfg::get().pb.ignore_middle_alignment);
+ cfg::get().K, cfg::get().pb.ignore_middle_alignment, cfg::get().output_dir, cfg::get().pb);
// path_extend::ContigWriter cw(gp.g);
// cw.WriteEdges("before_rr_with_ids.fasta");
@@ -149,7 +149,7 @@ void align_pacbio(conj_graph_pack &gp, int lib_id, bool make_additional_saves) {
gaps.PadGapStrings();
if (make_additional_saves)
gaps.DumpToFile(cfg::get().output_saves + "gaps_padded.mpr");
- pacbio::PacbioGapCloser<Graph> gap_closer(gp.g, consensus_gap_closing);
+ pacbio::PacbioGapCloser<Graph> gap_closer(gp.g, consensus_gap_closing, cfg::get().pb.max_contigs_gap_length);
gap_closer.ConstructConsensus(cfg::get().max_threads, gaps);
gap_closer.CloseGapsInGraph(replacement);
long_reads.ReplaceEdges(replacement);
diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp
index 79a85d3..bc01e1d 100644
--- a/src/projects/spades/pair_info_count.cpp
+++ b/src/projects/spades/pair_info_count.cpp
@@ -17,7 +17,6 @@
namespace debruijn_graph {
-typedef io::SequencingLibrary<config::DataSetData> SequencingLib;
bool RefineInsertSizeForLib(conj_graph_pack &gp, size_t ilib, size_t edge_length_threshold) {
@@ -26,9 +25,10 @@ bool RefineInsertSizeForLib(conj_graph_pack &gp, size_t ilib, size_t edge_length
SequenceMapperNotifier notifier(gp);
notifier.Subscribe(ilib, &hist_counter);
- SequencingLib &reads = cfg::get_writable().ds.reads[ilib];
- VERIFY(reads.data().read_length != 0);
+ auto& reads = cfg::get_writable().ds.reads[ilib];
auto paired_streams = paired_binary_readers(reads, false);
+
+ VERIFY(reads.data().read_length != 0);
notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads));
INFO(hist_counter.mapped() << " paired reads (" <<
@@ -56,7 +56,7 @@ bool RefineInsertSizeForLib(conj_graph_pack &gp, size_t ilib, size_t edge_length
void ProcessSingleReads(conj_graph_pack &gp, size_t ilib,
bool use_binary = true) {
- const SequencingLib &reads = cfg::get().ds.reads[ilib];
+ auto& reads = cfg::get_writable().ds.reads[ilib];
SequenceMapperNotifier notifier(gp);
GappedLongReadMapper read_mapper(gp, gp.single_long_reads[ilib]);
SimpleLongReadMapper simple_read_mapper(gp, gp.single_long_reads[ilib]);
@@ -80,7 +80,7 @@ void ProcessSingleReads(conj_graph_pack &gp, size_t ilib,
}
void ProcessPairedReads(conj_graph_pack &gp, size_t ilib, bool map_single_reads) {
- const SequencingLib &reads = cfg::get().ds.reads[ilib];
+ auto& reads = cfg::get_writable().ds.reads[ilib];
bool calculate_threshold = (reads.type() == io::LibraryType::PairedEnd);
SequenceMapperNotifier notifier(gp);
INFO("Left insert size qauntile " << reads.data().insert_size_left_quantile <<
@@ -233,7 +233,7 @@ void PairInfoCount::run(conj_graph_pack &gp, const char *) {
cfg::get_writable().use_single_reads |= map_single_reads;
if(cfg::get().mode == debruijn_graph::config::pipeline_type::meta
- && cfg::get().use_single_reads) {
+ && cfg::get().use_single_reads) {
map_single_reads = false;
cfg::get_writable().use_single_reads = false;
WARN("Single reads mappings are not used in metagenomic mode");
diff --git a/src/projects/spades/repeat_resolving.cpp b/src/projects/spades/repeat_resolving.cpp
index 9a5424a..e5044d8 100644
--- a/src/projects/spades/repeat_resolving.cpp
+++ b/src/projects/spades/repeat_resolving.cpp
@@ -19,14 +19,20 @@
namespace debruijn_graph {
void PEResolving(conj_graph_pack& gp) {
- vector<size_t> indexes;
- std::string name = "scaffolds";
- bool traverse_loops = true;
- if (!(cfg::get().use_scaffolder && cfg::get().pe_params.param_set.scaffolder_options.on)) {
- name = "final_contigs";
- traverse_loops = false;
- }
- path_extend::ResolveRepeatsPe(gp, cfg::get().output_dir, name, traverse_loops, boost::optional<std::string>("final_contigs"));
+ string scaffolds_name = cfg::get().mode == config::pipeline_type::rna ? "transcripts" : "scaffolds";
+ bool output_broke_scaffolds = cfg::get().mode != config::pipeline_type::rna;
+
+ path_extend::PathExtendParamsContainer params(cfg::get().pe_params,
+ cfg::get().output_dir,
+ "final_contigs",
+ scaffolds_name,
+ cfg::get().mode,
+ cfg::get().uneven_depth,
+ cfg::get().avoid_rc_connections,
+ cfg::get().use_scaffolder,
+ output_broke_scaffolds);
+
+ path_extend::ResolveRepeatsPe(cfg::get().ds, params, gp);
}
inline bool HasValidLibs() {
@@ -55,7 +61,7 @@ void RepeatResolution::run(conj_graph_pack &gp, const char*) {
INFO("Setting up preliminary path extend settings")
cfg::get_writable().pe_params = *cfg::get().prelim_pe_params;
}
- OutputContigs(gp.g, cfg::get().output_dir + "before_rr", false, 0, false);
+ OutputContigs(gp.g, cfg::get().output_dir + "before_rr", false);
OutputContigsToFASTG(gp.g, cfg::get().output_dir + "assembly_graph",gp.components);
bool no_valid_libs = !HasValidLibs();
@@ -65,7 +71,7 @@ void RepeatResolution::run(conj_graph_pack &gp, const char*) {
WARN("Insert size was not estimated for any of the paired libraries, repeat resolution module will not run.");
if ((no_valid_libs || cfg::get().rm == config::resolving_mode::none) && !use_single_reads) {
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false, 0, false);
+ OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false);
return;
}
if (cfg::get().rm == config::resolving_mode::path_extend) {
@@ -73,7 +79,7 @@ void RepeatResolution::run(conj_graph_pack &gp, const char*) {
PEResolving(gp);
} else {
INFO("Unsupported repeat resolver");
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false, 0, false);
+ OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false);
}
if (preliminary_) {
INFO("Restoring initial path extend settings")
@@ -82,11 +88,10 @@ void RepeatResolution::run(conj_graph_pack &gp, const char*) {
}
void ContigOutput::run(conj_graph_pack &gp, const char*) {
- OutputContigs(gp.g, cfg::get().output_dir + "simplified_contigs", cfg::get().use_unipaths,
- cfg::get().simp.tec.plausibility_length, false);
- OutputContigs(gp.g, cfg::get().output_dir + "before_rr", false, 0, false);
+ OutputContigs(gp.g, cfg::get().output_dir + "simplified_contigs", cfg::get().use_unipaths);
+ OutputContigs(gp.g, cfg::get().output_dir + "before_rr", false);
OutputContigsToFASTG(gp.g, cfg::get().output_dir + "assembly_graph", gp.components);
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false, 0, false);
+ OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false);
}
diff --git a/src/projects/truseq_analysis/analysis_pipeline.cpp b/src/projects/truseq_analysis/analysis_pipeline.cpp
index c7ef6a5..413e6cc 100644
--- a/src/projects/truseq_analysis/analysis_pipeline.cpp
+++ b/src/projects/truseq_analysis/analysis_pipeline.cpp
@@ -12,6 +12,7 @@
#include "stages/construction.hpp"
#include "dev_support/standard_base.hpp"
#include "analysis_pipeline.hpp"
+#include "modules/io/reads_io/file_reader.hpp"
spades::VariationDetectionStage::VariationDetectionStage(string output_file, const Config &config) : AssemblyStage("VariationDetection", "variation_detection"),
output_file_(output_file), config_(config) {
diff --git a/src/spades_pipeline/options_storage.py b/src/spades_pipeline/options_storage.py
index 01fb868..92e6580 100644
--- a/src/spades_pipeline/options_storage.py
+++ b/src/spades_pipeline/options_storage.py
@@ -35,6 +35,8 @@ scaffolds_name = "scaffolds.fasta"
assembly_graph_name = "assembly_graph.fastg"
contigs_paths = "contigs.paths"
scaffolds_paths = "scaffolds.paths"
+transcripts_name = "transcripts.fasta"
+transcripts_paths = "transcripts.paths"
#other constants
MIN_K = 1
@@ -45,6 +47,7 @@ THRESHOLD_FOR_BREAKING_ADDITIONAL_CONTIGS = 10
#default values constants
THREADS = 16
MEMORY = 250
+K_MERS_RNA = [55]
K_MERS_SHORT = [21,33,55]
K_MERS_150 = [21,33,55,77]
K_MERS_250 = [21,33,55,77,99,127]
@@ -181,10 +184,11 @@ def usage(spades_version, show_hidden=False, mode=None):
sys.stderr.write("" + "\n")
sys.stderr.write("Basic options:" + "\n")
sys.stderr.write("-o\t<output_dir>\tdirectory to store all the resulting files (required)" + "\n")
- if mode != "dip":
+ if mode is None: # nothing special, just regular spades.py
sys.stderr.write("--sc\t\t\tthis flag is required for MDA (single-cell) data" + "\n")
sys.stderr.write("--meta\t\t\tthis flag is required for metagenomic sample data" + "\n")
- sys.stderr.write("--plasmid\tRuns plasmidSPAdes pipeline for plasmid detection \n");
+ sys.stderr.write("--rna\t\t\tthis flag is required for RNA-Seq data \n")
+ sys.stderr.write("--plasmid\t\truns plasmidSPAdes pipeline for plasmid detection \n")
sys.stderr.write("--iontorrent\t\tthis flag is required for IonTorrent data" + "\n")
sys.stderr.write("--test\t\t\truns SPAdes on toy dataset" + "\n")
@@ -213,33 +217,34 @@ def usage(spades_version, show_hidden=False, mode=None):
" for paired-end library number <#> (<#> = 1,2,..,9; <or> = fr, rf, ff)" + "\n")
sys.stderr.write("--s<#>\t\t<filename>\tfile with unpaired reads"\
" for single reads library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--mp<#>-12\t<filename>\tfile with interlaced"\
- " reads for mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--mp<#>-1\t<filename>\tfile with forward reads"\
- " for mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--mp<#>-2\t<filename>\tfile with reverse reads"\
- " for mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--mp<#>-s\t<filename>\tfile with unpaired reads"\
- " for mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--mp<#>-<or>\torientation of reads"\
- " for mate-pair library number <#> (<#> = 1,2,..,9; <or> = fr, rf, ff)" + "\n")
- sys.stderr.write("--hqmp<#>-12\t<filename>\tfile with interlaced"\
- " reads for high-quality mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--hqmp<#>-1\t<filename>\tfile with forward reads"\
- " for high-quality mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--hqmp<#>-2\t<filename>\tfile with reverse reads"\
- " for high-quality mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--hqmp<#>-s\t<filename>\tfile with unpaired reads"\
- " for high-quality mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--hqmp<#>-<or>\torientation of reads"\
- " for high-quality mate-pair library number <#> (<#> = 1,2,..,9; <or> = fr, rf, ff)" + "\n")
- sys.stderr.write("--nxmate<#>-1\t<filename>\tfile with forward reads"\
- " for Lucigen NxMate library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--nxmate<#>-2\t<filename>\tfile with reverse reads"\
- " for Lucigen NxMate library number <#> (<#> = 1,2,..,9)" + "\n")
- sys.stderr.write("--sanger\t<filename>\tfile with Sanger reads\n")
- sys.stderr.write("--pacbio\t<filename>\tfile with PacBio reads\n")
- sys.stderr.write("--nanopore\t<filename>\tfile with Nanopore reads\n")
+ if mode not in ["rna", "meta"]:
+ sys.stderr.write("--mp<#>-12\t<filename>\tfile with interlaced"\
+ " reads for mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--mp<#>-1\t<filename>\tfile with forward reads"\
+ " for mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--mp<#>-2\t<filename>\tfile with reverse reads"\
+ " for mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--mp<#>-s\t<filename>\tfile with unpaired reads"\
+ " for mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--mp<#>-<or>\torientation of reads"\
+ " for mate-pair library number <#> (<#> = 1,2,..,9; <or> = fr, rf, ff)" + "\n")
+ sys.stderr.write("--hqmp<#>-12\t<filename>\tfile with interlaced"\
+ " reads for high-quality mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--hqmp<#>-1\t<filename>\tfile with forward reads"\
+ " for high-quality mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--hqmp<#>-2\t<filename>\tfile with reverse reads"\
+ " for high-quality mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--hqmp<#>-s\t<filename>\tfile with unpaired reads"\
+ " for high-quality mate-pair library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--hqmp<#>-<or>\torientation of reads"\
+ " for high-quality mate-pair library number <#> (<#> = 1,2,..,9; <or> = fr, rf, ff)" + "\n")
+ sys.stderr.write("--nxmate<#>-1\t<filename>\tfile with forward reads"\
+ " for Lucigen NxMate library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--nxmate<#>-2\t<filename>\tfile with reverse reads"\
+ " for Lucigen NxMate library number <#> (<#> = 1,2,..,9)" + "\n")
+ sys.stderr.write("--sanger\t<filename>\tfile with Sanger reads\n")
+ sys.stderr.write("--pacbio\t<filename>\tfile with PacBio reads\n")
+ sys.stderr.write("--nanopore\t<filename>\tfile with Nanopore reads\n")
sys.stderr.write("--trusted-contigs\t<filename>\tfile with trusted contigs\n")
sys.stderr.write("--untrusted-contigs\t<filename>\tfile with untrusted contigs\n")
if mode == "dip":
@@ -254,8 +259,8 @@ def usage(spades_version, show_hidden=False, mode=None):
sys.stderr.write("--only-assembler\truns only assembling (without read error"\
" correction)" + "\n")
if mode != "dip":
- sys.stderr.write("--careful\t\ttries to reduce number"\
- " of mismatches and short indels" + "\n")
+ if mode not in ["rna", "meta"]:
+ sys.stderr.write("--careful\t\ttries to reduce number of mismatches and short indels" + "\n")
sys.stderr.write("--continue\t\tcontinue run from the last available check-point" + "\n")
sys.stderr.write("--restart-from\t<cp>\trestart run with updated options and from the specified check-point ('ec', 'as', 'k<int>', 'mc')" + "\n")
sys.stderr.write("--disable-gzip-output\tforces error correction not to"\
@@ -280,11 +285,17 @@ def usage(spades_version, show_hidden=False, mode=None):
sys.stderr.write("\t\t\t\t[default: %s]\n" % MEMORY)
sys.stderr.write("--tmp-dir\t<dirname>\tdirectory for temporary files" + "\n")
sys.stderr.write("\t\t\t\t[default: <output_dir>/tmp]" + "\n")
- sys.stderr.write("-k\t\t<int,int,...>\tcomma-separated list of k-mer sizes"\
+ if mode != 'rna':
+ sys.stderr.write("-k\t\t<int,int,...>\tcomma-separated list of k-mer sizes" \
" (must be odd and" + "\n")
- sys.stderr.write("\t\t\t\tless than " + str(MAX_K + 1) + ") [default: 'auto']" + "\n")
- sys.stderr.write("--cov-cutoff\t<float>\t\tcoverage cutoff value (a positive float number, "
- "or 'auto', or 'off') [default: 'off']" + "\n")
+ sys.stderr.write("\t\t\t\tless than " + str(MAX_K + 1) + ") [default: 'auto']" + "\n")
+ else:
+ sys.stderr.write("-k\t\t<int>\t\tk-mer size (must be odd and less than " + str(MAX_K + 1) + ") " \
+ "[default: " + str(K_MERS_RNA[0]) + "]\n")
+
+ if mode not in ["rna", "meta"]:
+ sys.stderr.write("--cov-cutoff\t<float>\t\tcoverage cutoff value (a positive float number, "
+ "or 'auto', or 'off') [default: 'off']" + "\n")
sys.stderr.write("--phred-offset\t<33 or 64>\tPHRED quality offset in the"\
" input reads (33 or 64)" + "\n")
sys.stderr.write("\t\t\t\t[default: auto-detect]" + "\n")
@@ -307,8 +318,7 @@ def usage(spades_version, show_hidden=False, mode=None):
" for BayesHammer" + "\n")
sys.stderr.write("--spades-heap-check\t<value>\tsets HEAPCHECK environment variable"\
" for SPAdes" + "\n")
- sys.stderr.write("--large-genome\tEnables optimizations for large genomes \n");
- sys.stderr.write("--rna\tRuns rnaSPAdes pipeline for RNA-Seq data \n");
+ sys.stderr.write("--large-genome\tEnables optimizations for large genomes \n")
sys.stderr.write("--help-hidden\tprints this usage message with all hidden options" + "\n")
if show_hidden and mode == "dip":
diff --git a/src/spades_pipeline/spades_logic.py b/src/spades_pipeline/spades_logic.py
index c04f5e9..1aafd6b 100644
--- a/src/spades_pipeline/spades_logic.py
+++ b/src/spades_pipeline/spades_logic.py
@@ -19,6 +19,7 @@ import options_storage
BASE_STAGE = "construction"
READS_TYPES_USED_IN_CONSTRUCTION = ["paired-end", "single", "hq-mate-pairs"]
+READS_TYPES_USED_IN_RNA_SEQ = ["paired-end", "single", "trusted-contigs", "untrusted-contigs"]
def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home):
@@ -94,13 +95,13 @@ def update_k_mers_in_special_cases(cur_k_mers, RL, log, silent=False):
if RL >= 150:
if not silent:
log.info("Default k-mer sizes were set to %s because estimated "
- "read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL), log)
+ "read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL))
return options_storage.K_MERS_150
if RL <= max(cur_k_mers):
new_k_mers = [k for k in cur_k_mers if k < RL]
if not silent:
log.info("K-mer sizes were set to %s because estimated "
- "read length (%d) is less than %d" % (str(new_k_mers), RL, max(cur_k_mers)), log)
+ "read length (%d) is less than %d" % (str(new_k_mers), RL, max(cur_k_mers)))
return new_k_mers
return cur_k_mers
@@ -339,26 +340,35 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
result_before_rr_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "before_rr.fasta")
if not os.path.isfile(result_before_rr_contigs) or not options_storage.continue_mode:
shutil.copyfile(os.path.join(latest, "before_rr.fasta"), result_before_rr_contigs)
- if os.path.isfile(os.path.join(latest, "final_contigs.fasta")):
- if not os.path.isfile(cfg.result_contigs) or not options_storage.continue_mode:
- shutil.copyfile(os.path.join(latest, "final_contigs.fasta"), cfg.result_contigs)
- if os.path.isfile(os.path.join(latest, "first_pe_contigs.fasta")):
- result_first_pe_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "first_pe_contigs.fasta")
- if not os.path.isfile(result_first_pe_contigs) or not options_storage.continue_mode:
- shutil.copyfile(os.path.join(latest, "first_pe_contigs.fasta"), result_first_pe_contigs)
- if cfg.rr_enable:
- if os.path.isfile(os.path.join(latest, "scaffolds.fasta")):
- if not os.path.isfile(cfg.result_scaffolds) or not options_storage.continue_mode:
- shutil.copyfile(os.path.join(latest, "scaffolds.fasta"), cfg.result_scaffolds)
- if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")):
- if not os.path.isfile(cfg.result_graph) or not options_storage.continue_mode:
- shutil.copyfile(os.path.join(latest, "assembly_graph.fastg"), cfg.result_graph)
- if os.path.isfile(os.path.join(latest, "final_contigs.paths")):
- if not os.path.isfile(cfg.result_contigs_paths) or not options_storage.continue_mode:
- shutil.copyfile(os.path.join(latest, "final_contigs.paths"), cfg.result_contigs_paths)
- if os.path.isfile(os.path.join(latest, "scaffolds.paths")):
- if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode:
- shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths)
+ if options_storage.rna:
+ if os.path.isfile(os.path.join(latest, "transcripts.fasta")):
+ if not os.path.isfile(cfg.result_transcripts) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "transcripts.fasta"), cfg.result_transcripts)
+ if os.path.isfile(os.path.join(latest, "transcripts.paths")):
+ if not os.path.isfile(cfg.result_transcripts_paths) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "transcripts.paths"), cfg.result_transcripts_paths)
+ else:
+ if os.path.isfile(os.path.join(latest, "final_contigs.fasta")):
+ if not os.path.isfile(cfg.result_contigs) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "final_contigs.fasta"), cfg.result_contigs)
+ if os.path.isfile(os.path.join(latest, "first_pe_contigs.fasta")):
+ result_first_pe_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "first_pe_contigs.fasta")
+ if not os.path.isfile(result_first_pe_contigs) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "first_pe_contigs.fasta"), result_first_pe_contigs)
+ if cfg.rr_enable:
+ if os.path.isfile(os.path.join(latest, "scaffolds.fasta")):
+ if not os.path.isfile(cfg.result_scaffolds) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "scaffolds.fasta"), cfg.result_scaffolds)
+ if os.path.isfile(os.path.join(latest, "scaffolds.paths")):
+ if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths)
+ if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")):
+ if not os.path.isfile(cfg.result_graph) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "assembly_graph.fastg"), cfg.result_graph)
+ if os.path.isfile(os.path.join(latest, "final_contigs.paths")):
+ if not os.path.isfile(cfg.result_contigs_paths) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "final_contigs.paths"), cfg.result_contigs_paths)
+
if cfg.developer_mode:
# saves
diff --git a/src/utils/adt/bf.hpp b/src/utils/adt/bf.hpp
new file mode 100644
index 0000000..1c9ef92
--- /dev/null
+++ b/src/utils/adt/bf.hpp
@@ -0,0 +1,174 @@
+#pragma once
+
+#include <functional>
+#include <vector>
+#include <atomic>
+
+#include <cassert>
+
+namespace bf {
+
+/// The counting Bloom filter: `cells` saturating counters of `width_` bits each, packed into atomic 64-bit words.
+template<class T, unsigned width_ = 4>
+class counting_bloom_filter {
+    counting_bloom_filter(const counting_bloom_filter&) = delete; // not copyable; move construction is defaulted further down
+    counting_bloom_filter& operator=(const counting_bloom_filter&) = delete;
+
+protected:
+    static constexpr uint64_t cell_mask_ = (1ull << width_) - 1; // lowest width_ bits set: mask of one cell and also its maximum value
+    static constexpr size_t cells_per_entry_ = 8 * sizeof(uint64_t) / width_; // how many cells are packed into one 64-bit word
+
+public:
+    /// The hash digest type.
+    typedef size_t digest;
+
+    /// The hash function type: called as hasher(element, seed), where the seed is the index of the hash function.
+    typedef std::function<digest(const T&, uint64_t seed)> hasher;
+
+    counting_bloom_filter() = default; // NOTE(review): num_hashes_ and cells_ have no initializers, so they may be left uninitialized here - confirm such objects are never used for add()/lookup()
+    ~counting_bloom_filter() = default;
+
+    /// Constructs a counting Bloom filter.
+    /// @param h The hasher.
+    /// @param cells The number of cells.
+    /// @param num_hashes The number of hash functions to use
+    /// The memory consumption will be cells * width bits (rounded up to whole 64-bit words)
+    counting_bloom_filter(hasher h,
+                          size_t cells, size_t num_hashes = 3)
+            : hasher_(std::move(h)),
+              num_hashes_(num_hashes),
+              cells_(cells),
+              data_((cells * width_ + 8 * sizeof(uint64_t) - 1)/ 8 / sizeof(uint64_t)) { // ceil(cells * width_ / 64) words
+        static_assert((width_ & (width_ - 1)) == 0, "Width must be power of two"); // presumably so that width_ divides 64 and no cell straddles two words
+    }
+
+    /// Move-constructs a counting Bloom filter.
+    counting_bloom_filter(counting_bloom_filter&&) = default;
+
+    /// Adds an element to the Bloom filter: increments one cell per hash function unless that cell is already at its maximum.
+    /// @tparam T The type of the element to insert.
+    /// @param o An instance of type `T`.
+    void add(const T &o) {
+        for (size_t i = 0; i < num_hashes_; ++i) {
+            digest d = hasher_(o, i); // the loop index doubles as the hash seed
+            size_t cell_id = d - cells_ * (d / cells_); // == d % cells_; uses explicit division in order to test stuff like libidivide
+            size_t pos = cell_id / cells_per_entry_; // index of the 64-bit word holding the cell
+            size_t epos = cell_id - pos * cells_per_entry_; // index of the cell inside that word
+            auto &entry = data_[pos];
+            uint64_t mask = cell_mask_ << (width_ * epos); // bits of this cell within the word
+
+            // Add counter: compare-and-swap loop, retried when another thread modified the word in between
+            while (true) {
+                uint64_t val = entry.load();
+
+                // Overflow (cell already holds its maximum value), do nothing
+                if ((val & mask) == mask)
+                    break;
+
+                uint64_t newval = val + (1ull << (width_ * epos)); // +1 in this cell; cannot carry into the next cell because the cell is not saturated
+                if (!entry.compare_exchange_strong(val, newval))
+                    continue;
+
+                break;
+            }
+
+        }
+    }
+
+    /// Retrieves the count of an element: the minimum over the cells selected by the hash functions.
+    /// @tparam T The type of the element to query.
+    /// @param o An instance of type `T`.
+    /// @return A frequency estimate for *o* (at most the maximum cell value, 2^width_ - 1).
+    size_t lookup(const T &o) const {
+        size_t val = (1ull << width_) - 1; // start from the maximum cell value and narrow down to the minimum
+        for (size_t i = 0; i < num_hashes_; ++i) {
+            digest d = hasher_(o, i);
+            size_t cell_id = d - cells_ * (d / cells_); // == d % cells_; uses explicit division in order to test stuff like libidivide
+            size_t pos = cell_id / cells_per_entry_;
+            size_t epos = cell_id - pos * cells_per_entry_;
+            size_t cval = (data_[pos] >> (width_ * epos)) & cell_mask_; // implicit atomic load of the word, then extract the cell
+            if (val > cval)
+                val = cval;
+        }
+
+        return val;
+    }
+
+    /// Removes all items from the Bloom filter by storing 0 into every word.
+    void clear() {
+        std::fill(data_.begin(), data_.end(), 0); // NOTE(review): words are reset one at a time, not as one atomic operation - confirm clear() is never run concurrently with add()
+    }
+
+protected:
+    hasher hasher_; // seeded hash function supplied by the caller
+    size_t num_hashes_; // number of cells touched per element
+    size_t cells_; // total number of cells addressed by the hashes
+    std::vector<std::atomic<uint64_t>> data_; // packed cells, cells_per_entry_ per word
+};
+
+/// Counting Bloom filter variant whose cells count in unary: each increment sets the next higher bit of the cell, so a cell can count up to width_.
+template<class T, unsigned width_ = 4>
+class bitcounting_bloom_filter : public counting_bloom_filter<T, width_> {
+    using typename counting_bloom_filter<T, width_>::digest; // bring the dependent base-class typedefs into scope
+    using typename counting_bloom_filter<T, width_>::hasher;
+
+  public:
+    bitcounting_bloom_filter(hasher h,
+                             size_t cells, size_t num_hashes = 3)
+            : counting_bloom_filter<T, width_>(h, cells, num_hashes) {} // same storage layout as the base class
+
+    /// Adds an element to the Bloom filter (hides the base-class add; sets one more bit in each selected cell).
+    /// @tparam T The type of the element to insert.
+    /// @param o An instance of type `T`.
+    void add(const T &o) {
+        for (size_t i = 0; i < this->num_hashes_; ++i) {
+            digest d = this->hasher_(o, i); // the loop index doubles as the hash seed
+            size_t cell_id = d - this->cells_ * (d / this->cells_); // == d % cells_; uses explicit division in order to test stuff like libidivide
+            size_t pos = cell_id / this->cells_per_entry_; // index of the 64-bit word holding the cell
+            size_t epos = cell_id - pos * this->cells_per_entry_; // index of the cell inside that word
+            auto &entry = this->data_[pos];
+            uint64_t mask = this->cell_mask_ << (width_ * epos); // bits of this cell within the word
+
+            // Add counter: set the lowest still-unset bit of the cell, retrying when the cell changed concurrently
+            while (true) {
+                uint64_t val = entry.load() & mask; // current cell bits, still at their position inside the word
+
+                // Overflow (all width_ bits of the cell are set), do nothing
+                if (val == mask)
+                    break;
+
+                uint64_t cellval = val >> width_ * epos; // cell bits shifted down to bit 0
+                size_t cnt = (cellval == 0 ? 0 : 64 - __builtin_clzll(cellval)) + width_ * epos; // word-level index of the bit just above the highest set bit; the zero guard avoids calling the builtin with 0
+
+                if ((std::atomic_fetch_or(&entry, uint64_t(1) << cnt) & mask) != val) // cell contents seen by fetch_or differ from what was loaded above: retry
+                    continue;
+
+                break;
+            }
+        }
+    }
+
+    /// Retrieves the count of an element: the minimum, over the selected cells, of the position of the highest set bit (1-based, 0 for an empty cell).
+    /// @tparam T The type of the element to query.
+    /// @param o An instance of type `T`.
+    /// @return A frequency estimate for *o*.
+    size_t lookup(const T &o) const {
+        size_t val = (1ull << width_) - 1; // initial upper bound, narrowed down to the minimum below
+        for (size_t i = 0; i < this->num_hashes_; ++i) {
+            digest d = this->hasher_(o, i);
+            size_t cell_id = d - this->cells_ * (d / this->cells_); // == d % cells_; uses explicit division in order to test stuff like libidivide
+            size_t pos = cell_id / this->cells_per_entry_;
+            size_t epos = cell_id - pos * this->cells_per_entry_;
+            uint64_t entry = (this->data_[pos] >> (width_ * epos)) & this->cell_mask_; // implicit atomic load of the word, then extract the cell
+            size_t cval = (entry == 0 ? 0 : 64 - __builtin_clzll(entry)); // bit length of the cell value = number of recorded increments
+
+            if (val > cval)
+                val = cval;
+        }
+
+        return val;
+    }
+};
+
+
+} // namespace bf
diff --git a/src/utils/adt/hll.hpp b/src/utils/adt/hll.hpp
new file mode 100644
index 0000000..ab24fbe
--- /dev/null
+++ b/src/utils/adt/hll.hpp
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <vector>
+#include <functional>
+#include <numeric>
+#include <cmath>
+
+namespace hll {
+    template<class T, unsigned precision = 24> // precision = number of digest bits used to pick a register
+    class hll { // cardinality sketch with 2^precision one-byte registers; presumably HyperLogLog, going by the name and the formula in cardinality()
+        static constexpr uint64_t m_ = 1ull << precision; // number of registers
+        static constexpr uint64_t mask_ = (m_ - 1) << (64 - precision); // selects the top `precision` bits of a 64-bit digest
+
+        constexpr double alpha(unsigned p) const { // scaling constant used by cardinality(); presumably the HyperLogLog alpha_m for m = 2^p - TODO confirm
+            // constexpr switches are C++14 only :(
+            return (p > 6 ?
+                    0.7213 / (1.0 + 1.079 / double(1ull << p)) :
+                    p == 6 ? 0.709 : p == 5 ? 0.697 : 0.673);
+        }
+
+    public:
+        /// The hash digest type.
+        typedef uint64_t digest;
+
+        /// The hash function type.
+        typedef std::function<digest(const T)> hasher;
+
+        hll(hasher h) // all 2^precision registers start at 0
+                : hasher_(std::move(h)), data_(1ull << precision, 0)
+        { }
+
+
+        /// Records an element: hashes it and raises the selected register to rho if rho is larger.
+        /// @tparam T The type of the element to insert.
+        /// @param o An instance of type `T`.
+        void add(const T &o) {
+            digest d = hasher_(o);
+
+            // Split digest into parts: top `precision` bits pick the register, the remaining bits give rho
+            size_t id = (d & mask_) >> (64 - precision);
+            uint8_t rho = uint8_t(((d & ~mask_) == 0 ? 64 : __builtin_clzll(d & ~mask_)) - precision + 1); // leading zeros within the low (64 - precision) bits, plus one; the zero guard avoids calling the builtin with 0
+            if (data_[id] < rho)
+                data_[id] = rho;
+        }
+
+        void merge(const hll &other) { // element-wise maximum of the registers; both sides have 2^precision registers since they share the template arguments
+            for (size_t i = 0; i < data_.size(); ++i)
+                data_[i] = std::max(data_[i], other.data_[i]);
+        }
+
+        std::pair<double, bool> cardinality() const { // returns {estimate, estimate > 2.5 * m_}; presumably the flag tells whether the raw estimate is above the small-range threshold - TODO confirm against callers
+            // FIXME: Precision loss?
+            // FIXME: Bias correction!
+            double res = alpha(precision) * m_ * m_;
+            double E = std::accumulate(data_.begin(), data_.end(),
+                                       0.0, [](double a, uint8_t b) { return a + exp2(-(double)b); }); // sum of 2^(-register) over all registers
+            res /= E; // estimate = alpha * m^2 / sum
+            return { res, res > 5.0 * m_/2 };
+        }
+
+        void clear() { // resets every register to 0
+            std::fill(data_.begin(), data_.end(), 0);
+        }
+
+    private:
+        hasher hasher_; // hash function supplied by the caller
+        std::vector<uint8_t> data_; // one register per index, holding the largest rho seen
+    };
+
+} //namespace hll
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/spades.git
More information about the debian-med-commit
mailing list